scrapy · kmike · Nov 2, 2018 · Nov 17, 2018 · May 30, 2019 · Feb 9, 2022
diff --git a/.gitignore b/.gitignore
@@ -28,6 +28,7 @@ pip-log.txt
 nosetests.xml
 htmlcov
 .pytest_cache
+coverage.xml
 
 # Translations
 *.mo

diff --git a/parsel/selector.py b/parsel/selector.py
@@ -6,6 +6,8 @@
 from typing import Any, Dict, List, Optional, Mapping, Pattern, Union
 
 from lxml import etree, html
+from lxml.html.clean import Cleaner  # pylint: disable=no-name-in-module
+import html_text
 
 from .utils import flatten, iflatten, extract_regex, shorten
 from .csstranslator import HTMLTranslator, GenericTranslator
@@ -173,30 +175,64 @@ def re_first(
             return el
         return default
 
-    def getall(self) -> List[str]:
+    def getall(
+        self,
+        text: bool = False,
+        cleaner: Union[str, None, Cleaner] = "auto",
+        guess_punct_space: bool = True,
+        guess_layout: bool = True,
+    ) -> List[str]:
         """
         Call the ``.get()`` method for each element is this list and return
         their results flattened, as a list of strings.
+
+        The ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+        options are forwarded unchanged to each element's
+        :meth:`~.Selector.get`; see that method for details.
         """
-        return [x.get() for x in self]
+        return [
+            x.get(
+                text=text,
+                cleaner=cleaner,
+                guess_punct_space=guess_punct_space,
+                guess_layout=guess_layout,
+            )
+            for x in self
+        ]
 
     extract = getall
 
-    @typing.overload
-    def get(self, default: None = None) -> Optional[str]:
-        pass
-
-    @typing.overload
-    def get(self, default: str) -> str:
-        pass
-
-    def get(self, default: Optional[str] = None) -> Optional[str]:
+    # TODO: restore these overloads, extended with the new keyword options:
+    # @typing.overload
+    # def get(self, default: None = None) -> Optional[str]:
+    #     pass
+    #
+    # @typing.overload
+    # def get(self, default: str) -> str:
+    #     pass
+    def get(
+        self,
+        default: Optional[str] = None,
+        text: bool = False,
+        cleaner: Union[str, None, Cleaner] = "auto",
+        guess_punct_space: bool = True,
+        guess_layout: bool = True,
+    ) -> Optional[str]:
         """
         Return the result of ``.get()`` for the first element in this list.
-        If the list is empty, return the default value.
+        If the list is empty, return the ``default`` value.
+
+        The ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+        options are forwarded unchanged to :meth:`~.Selector.get`; see that
+        method for details.
         """
         for x in self:
-            return x.get()
+            return x.get(
+                text=text,
+                cleaner=cleaner,
+                guess_punct_space=guess_punct_space,
+                guess_layout=guess_layout,
+            )
         return default
 
     extract_first = get
@@ -257,6 +293,8 @@ class Selector:
     }
     _lxml_smart_strings = False
     selectorlist_cls = SelectorList["Selector"]
+    _text_cleaner = html_text.cleaner
+    _html_cleaner = Cleaner()
 
     def __init__(
         self,
@@ -412,33 +450,97 @@ def re_first(
             iflatten(self.re(regex, replace_entities=replace_entities)), default
         )
 
-    def get(self) -> str:
+    def get(
+        self,
+        text: bool = False,
+        cleaner: Union[str, None, Cleaner] = "auto",
+        guess_punct_space: bool = True,
+        guess_layout: bool = True,
+    ) -> str:
         """
         Serialize and return the matched nodes in a single string.
         Percent encoded content is unquoted.
-        """
+
+        When ``text`` is False (default), HTML or XML is extracted. Pass
+        ``text=True`` to extract text content (the html-text library is used).
+        The text extraction algorithm assumes that the document is an HTML
+        document, and uses HTML-specific rules.
+
+        The ``cleaner`` argument allows cleaning HTML before extracting the
+        content. Allowed values:
+
+        * "auto" (default) - don't clean when text=False, clean with
+          options tuned for text extraction when text=True;
+        * "text" - clean with options tuned for text extraction: elements
+          like ``<script>`` and ``<style>`` are removed, cleaning options
+          are tuned for speed, assuming text extraction is the end goal;
+        * "html" - use default ``lxml.html.clean.Cleaner``. This is useful
+          if you want to make .get() output more human-readable, but still
+          preserve HTML tags;
+        * None - don't clean, even when ``text=True``. Useful if you have
+          an already cleaned tree, e.g. after calling :meth:`Selector.cleaned`;
+        * custom ``lxml.html.clean.Cleaner`` objects are also supported.
+
+        The ``guess_punct_space`` and ``guess_layout`` options customize the
+        text extraction algorithm. By default, when ``text=True``,
+        parsel tries to insert newlines and blank lines as appropriate,
+        and to be smart about whitespace around inline tags,
+        so that the text output looks similar to a browser's.
+
+        Pass ``guess_punct_space=False`` to disable punctuation handling.
+        This option has no effect when ``text=False``.
+
+        Use ``guess_layout=False`` to avoid adding newlines - content will
+        be just a single line of text, using whitespace as separators.
+        This option has no effect when ``text=False``.
+        """
+        sel = self
+        if cleaner == "auto":
+            if text:
+                sel = self.cleaned("text")
+        elif cleaner is not None:
+            sel = self.cleaned(cleaner)
+        tree = sel.root
+
+        if text:
+            return html_text.etree_to_text(
+                tree, guess_punct_space=guess_punct_space, guess_layout=guess_layout
+            )
+
         try:
             return etree.tostring(
-                self.root,
-                method=self._tostring_method,
-                encoding="unicode",
-                with_tail=False,
+                tree, method=self._tostring_method, encoding="unicode", with_tail=False
             )
         except (AttributeError, TypeError):
-            if self.root is True:
+            if tree is True:
                 return "1"
-            elif self.root is False:
+            elif tree is False:
                 return "0"
             else:
-                return str(self.root)
+                return str(tree)
 
     extract = get
 
-    def getall(self) -> List[str]:
+    def getall(
+        self,
+        text: bool = False,
+        cleaner: Union[str, None, Cleaner] = "auto",
+        guess_punct_space: bool = True,
+        guess_layout: bool = True,
+    ) -> List[str]:
         """
         Serialize and return the matched node in a 1-element list of strings.
+
+        All options are forwarded to :meth:`~.Selector.get`; see it for details.
         """
-        return [self.get()]
+        return [
+            self.get(
+                text=text,
+                cleaner=cleaner,
+                guess_punct_space=guess_punct_space,
+                guess_layout=guess_layout,
+            )
+        ]
 
     def register_namespace(self, prefix: str, uri: str) -> None:
         """
@@ -492,6 +594,34 @@ def attrib(self) -> Dict[str, str]:
         """Return the attributes dictionary for underlying element."""
         return dict(self.root.attrib)
 
+    def cleaned(
+        self: _SelectorType, cleaner: Union[str, Cleaner] = "html"
+    ) -> _SelectorType:
+        """
+        Return a copy of a Selector, with underlying subtree cleaned.
+        Allowed values of ``cleaner`` (other strings raise ``ValueError``):
+
+        * "html" (default) - use default ``lxml.html.clean.Cleaner``;
+        * "text" - clean with options tuned for text extraction: elements
+          like ``<script>`` and ``<style>`` are removed, cleaning options
+          are tuned for speed, assuming text extraction is the end goal;
+        * custom ``lxml.html.clean.Cleaner`` objects are also supported.
+        """
+        if isinstance(cleaner, str) and cleaner not in {"html", "text"}:
+            raise ValueError(
+                "cleaner must be 'html', 'text' or "
+                "an lxml.html.clean.Cleaner instance"
+            )
+        # Map the string shortcuts onto the shared class-level cleaners.
+        if cleaner == "html":
+            cleaner = self._html_cleaner
+        elif cleaner == "text":
+            cleaner = self._text_cleaner
+        root = cleaner.clean_html(self.root)
+        return self.__class__(
+            root=root, _expr=self._expr, namespaces=self.namespaces, type=self.type
+        )
+
     def __bool__(self) -> bool:
         """
         Return ``True`` if there is any real content selected or ``False``

diff --git a/setup.py b/setup.py
@@ -28,6 +28,7 @@
         "cssselect>=0.9",
         "lxml",
         "w3lib>=1.19.0",
+        "html-text>=0.5.2",
     ],
     python_requires=">=3.6",
     license="BSD",

diff --git a/tests/test_xml_attacks.py b/tests/test_xml_attacks.py
@@ -8,7 +8,7 @@
 from parsel import Selector
 
 
-MiB_1 = 1024 ** 2
+MiB_1 = 1024**2
 
 
 def _load(attack):

diff --git a/tests/typing/selector.py b/tests/typing/selector.py
@@ -1,5 +1,6 @@
 # Basic usage of the Selector, strongly typed to test the typing of parsel's API.
 import re
+from typing import List
 from parsel import Selector
 
 
@@ -8,9 +9,9 @@ def correct() -> None:
         text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>"
     )
 
-    li_values: list[str] = selector.css("li").getall()
+    li_values: List[str] = selector.css("li").getall()
     selector.re_first(re.compile(r"[32]"), "").strip()
-    xpath_values: list[str] = selector.xpath(
+    xpath_values: List[str] = selector.xpath(
         "//somens:a/text()", namespaces={"somens": "http://scrapy.org"}
     ).extract()