Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] text extraction in Selector and SelectorList #127

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3c471b8
[tmp] Selector.text and SelectorList.text methods
kmike Nov 2, 2018
8dea4ce
[wip] move converting to text to .get method, add getall support, .cl…
kmike Nov 17, 2018
da7bb80
bump html-text required version number
kmike May 30, 2019
859044c
Merge branch 'master' into selector-text
kmike Feb 9, 2022
7bae279
selector text unit tests
shahidkarimi Mar 11, 2022
e4733ee
code formtting
shahidkarimi Mar 11, 2022
857ca72
code formatting improvements
shahidkarimi Mar 11, 2022
7941093
removed unwated tests
shahidkarimi Apr 4, 2022
102f2e3
Merge pull request #236 from shahidkarimi/selector-text-tests
kmike May 20, 2022
1f917bb
Merge branch 'master' into selector-text
kmike Jun 28, 2022
d87982d
apply black
kmike Jun 28, 2022
14dadbd
fixed failing test
kmike Jun 28, 2022
af0d28a
Make new arguments keyword-only
kmike Jun 28, 2022
1737f83
documentation for selector .get() text
shahidkarimi Aug 12, 2022
17ae5e0
suggested changes in the PR fixed
shahidkarimi Aug 26, 2022
f8f1c66
Merge branch 'master' into selector-text
kmike Nov 10, 2022
c6580cc
Update docs/usage.rst
kmike Nov 13, 2022
419af4b
Merge pull request #248 from shahidkarimi/selector-text-doc
kmike Nov 13, 2022
b8d0352
Merge branch 'master' into selector-text
kmike Apr 24, 2024
ee3e734
fixed typing
kmike May 1, 2024
69456c1
fixed a refactoring issue
kmike May 1, 2024
a492278
document O(N^2) gotcha
kmike May 8, 2024
8b4ae25
make flake8 config compatible with black
kmike May 8, 2024
ccaaa5b
refactor text and cleaning tests; add more of them
kmike May 8, 2024
4eea4fa
fixed default .cleaned cleaner value
kmike May 8, 2024
27c9919
fixed black formatting went wrong
kmike May 8, 2024
852bbef
fix docs references
kmike May 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pip-log.txt
nosetests.xml
htmlcov
.pytest_cache
coverage.xml

# Translations
*.mo
Expand Down
178 changes: 154 additions & 24 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from typing import Any, Dict, List, Optional, Mapping, Pattern, Union

from lxml import etree, html
from lxml.html.clean import Cleaner # pylint: disable=no-name-in-module
import html_text

from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator
Expand Down Expand Up @@ -173,30 +175,64 @@ def re_first(
return el
return default

def getall(self) -> List[str]:
def getall(
self,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
kmike marked this conversation as resolved.
Show resolved Hide resolved
) -> List[str]:
"""
Call the ``.get()`` method for each element in this list and return
their results flattened, as a list of strings.

``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
options are passed to :meth:`~.Selector.get`; see
:meth:`~.Selector.get` for more details.
"""
return [x.get() for x in self]
return [
x.get(
text=text,
cleaner=cleaner,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)
for x in self
]

extract = getall

@typing.overload
def get(self, default: None = None) -> Optional[str]:
pass

@typing.overload
def get(self, default: str) -> str:
pass

def get(self, default: Optional[str] = None) -> Optional[str]:
# TODO: bring types back
# @typing.overload
# def get(self, default: None = None) -> Optional[str]:
# pass
#
# @typing.overload
# def get(self, default: str) -> str:
# pass
def get(
self,
default: Optional[str] = None,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> Optional[str]:
"""
Return the result of ``.get()`` for the first element in this list.
If the list is empty, return the default value.
If the list is empty, return the ``default`` value.

``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
options are passed to :meth:`~.Selector.get`; see :meth:`~.Selector.get`
for more details.
"""
for x in self:
return x.get()
return x.get(
text=text,
cleaner=cleaner,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)
return default

extract_first = get
Expand Down Expand Up @@ -257,6 +293,8 @@ class Selector:
}
_lxml_smart_strings = False
selectorlist_cls = SelectorList["Selector"]
_text_cleaner = html_text.cleaner
_html_cleaner = Cleaner()

def __init__(
self,
Expand Down Expand Up @@ -412,33 +450,97 @@ def re_first(
iflatten(self.re(regex, replace_entities=replace_entities)), default
)

def get(self) -> str:
def get(
self,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> str:
"""
Serialize and return the matched nodes in a single string.
Percent encoded content is unquoted.
"""

When ``text`` is False (default), HTML or XML is extracted. Pass
``text=True`` to extract text content (html-text library is used).
Text extraction algorithm assumes that the document is an HTML
document, and uses HTML-specific rules.

``cleaner`` argument allows cleaning HTML before extracting the
content. Allowed values:

* "auto" (default) - don't clean when text=False, clean with
options tuned for text extraction when text=True;
* "text" - clean with options tuned for text extraction: elements
like ``<script>`` and ``<style>`` are removed, cleaning options
are tuned for speed, assuming text extraction is the end goal;
* "html" - use default ``lxml.html.clean.Cleaner``. This is useful
if you want to make .get() output more human-readable, but still
preserve HTML tags.
* None - don't clean, even when ``text=True``. Useful if you have
an already cleaned tree, e.g. after calling :meth:`~.Selector.cleaned`.
* custom ``lxml.html.clean.Cleaner`` objects are also supported.

``guess_punct_space`` and ``guess_layout`` options allow customizing the
text extraction algorithm. By default, when ``text=True``,
parsel tries to insert newlines and blank lines as appropriate,
and be smart about whitespaces around inline tags,
so that the text output looks similar to browser's.

Pass ``guess_punct_space=False`` to disable punctuation handling.
This option has no effect when ``text=False``.

Use ``guess_layout=False`` to avoid adding newlines - content will
be just a single line of text, using whitespaces as separators.
This option has no effect when ``text=False``.
"""
sel = self
if cleaner == "auto":
if text:
sel = self.cleaned("text")
elif cleaner is not None:
sel = self.cleaned(cleaner)
tree = sel.root

if text:
return html_text.etree_to_text(
tree, guess_punct_space=guess_punct_space, guess_layout=guess_layout
)

try:
return etree.tostring(
self.root,
method=self._tostring_method,
encoding="unicode",
with_tail=False,
tree, method=self._tostring_method, encoding="unicode", with_tail=False
)
except (AttributeError, TypeError):
if self.root is True:
if tree is True:
return "1"
elif self.root is False:
elif tree is False:
return "0"
else:
return str(self.root)
return str(tree)

extract = get

def getall(self) -> List[str]:
def getall(
self,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> List[str]:
"""
Serialize and return the matched node in a 1-element list of strings.

See :meth:`~.Selector.get` for options.
"""
return [self.get()]
return [
self.get(
text=text,
cleaner=cleaner,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)
]

def register_namespace(self, prefix: str, uri: str) -> None:
"""
Expand Down Expand Up @@ -492,6 +594,34 @@ def attrib(self) -> Dict[str, str]:
"""Return the attributes dictionary for underlying element."""
return dict(self.root.attrib)

def cleaned(
    self: _SelectorType, cleaner: Union[str, Cleaner] = "html"
) -> _SelectorType:
    """
    Return a copy of a Selector, with underlying subtree cleaned.

    Allowed values of ``cleaner`` argument:

    * "html" (default) - use default ``lxml.html.clean.Cleaner``;
    * "text" - clean with options tuned for text extraction: elements
      like ``<script>`` and ``<style>`` are removed, cleaning options
      are tuned for speed, assuming text extraction is the end goal;
    * custom ``lxml.html.clean.Cleaner`` objects are also supported.

    A :exc:`ValueError` is raised when ``cleaner`` is a string other than
    "html" or "text".
    """
    # The default used to be "auto", which is not an accepted name here, so
    # a bare ``sel.cleaned()`` always raised ValueError; "html" matches the
    # documented default.
    if isinstance(cleaner, str):
        # NOTE(review): an alternative floated in the PR discussion is to
        # make these cleaner attributes public and ask users to pass them,
        # e.g. sel.cleaned(sel.TEXT_CLEANER) instead of sel.cleaned('text').
        if cleaner == "html":
            cleaner = self._html_cleaner
        elif cleaner == "text":
            cleaner = self._text_cleaner
        else:
            raise ValueError(
                "cleaner must be 'html', 'text' or "
                "an lxml.html.clean.Cleaner instance"
            )

    root = cleaner.clean_html(self.root)
    return self.__class__(
        root=root, _expr=self._expr, namespaces=self.namespaces, type=self.type
    )

def __bool__(self) -> bool:
"""
Return ``True`` if there is any real content selected or ``False``
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"cssselect>=0.9",
"lxml",
"w3lib>=1.19.0",
"html-text>=0.5.2",
],
python_requires=">=3.6",
license="BSD",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_xml_attacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from parsel import Selector


MiB_1 = 1024 ** 2
MiB_1 = 1024**2


def _load(attack):
Expand Down
5 changes: 3 additions & 2 deletions tests/typing/selector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Basic usage of the Selector, strongly typed to test the typing of parsel's API.
import re
from typing import List
from parsel import Selector


Expand All @@ -8,9 +9,9 @@ def correct() -> None:
text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>"
)

li_values: list[str] = selector.css("li").getall()
li_values: List[str] = selector.css("li").getall()
selector.re_first(re.compile(r"[32]"), "").strip()
xpath_values: list[str] = selector.xpath(
xpath_values: List[str] = selector.xpath(
"//somens:a/text()", namespaces={"somens": "http://scrapy.org"}
).extract()

Expand Down