diff --git a/.bandit.yml b/.bandit.yml index f4d993c2..4f60a02f 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -1,3 +1,6 @@ skips: +- B101 +- B311 - B320 - B410 +exclude_dirs: ['tests'] diff --git a/.bumpversion.cfg b/.bumpversion.cfg index a0e930d9..adaa9807 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.7.0 +current_version = 1.9.1 commit = True tag = True tag_name = v{new_version} diff --git a/.coveragerc b/.coveragerc index a0e59ef0..ba07b2fb 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,5 @@ [run] branch = true -include = parsel/* [report] exclude_lines = diff --git a/.flake8 b/.flake8 index fe2937e4..7e5efc63 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,5 @@ [flake8] -ignore = E203 +ignore = E203,W503 per-file-ignores = docs/conftest.py:E501 parsel/csstranslator.py:E501 @@ -9,6 +9,7 @@ per-file-ignores = setup.py:E501 tests/test_selector.py:E501 tests/test_selector_csstranslator.py:E501 + tests/test_selector_jmespath.py:E501 tests/test_utils.py:E501 tests/test_xpathfuncs.py:E501 tests/typing/*.py:E,F diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..00d5546f --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# applying pre-commit hooks to the project +a57c23e3b7be0f001595bd8767fe05e40a66e730 \ No newline at end of file diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index ef4487e0..b06e7901 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -8,30 +8,27 @@ jobs: fail-fast: false matrix: include: - - python-version: "3.11" + - python-version: "3.12" env: - TOXENV: security - - python-version: "3.11" - env: - TOXENV: flake8 - - python-version: "3.11" + TOXENV: pre-commit + - python-version: "3.12" env: TOXENV: pylint - - python-version: "3.11" # Keep in sync with .readthedocs.yml + - python-version: "3.12" env: TOXENV: docs - - python-version: "3.11" + - python-version: "3.12" env: TOXENV: 
typing - - python-version: "3.11" + - python-version: "3.12" env: - TOXENV: black + TOXENV: twinecheck steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c4b017d9..ebbb8dff 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -7,12 +7,12 @@ jobs: if: startsWith(github.event.ref, 'refs/tags/') steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v4 + - name: Set up Python 3.12 + uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.12" - name: Check Tag id: check-release-tag diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b86ab67b..eb2561d0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,9 +8,6 @@ jobs: fail-fast: false matrix: include: - - python-version: "3.7" - env: - TOXENV: py - python-version: "3.8" env: TOXENV: py @@ -23,21 +20,24 @@ jobs: - python-version: "3.11" env: TOXENV: py - - python-version: pypy3.9 + - python-version: "3.12" + env: + TOXENV: py + - python-version: pypy3.10 env: TOXENV: pypy3 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install system libraries - if: contains(matrix.python-version, 'pypy3.9') + if: contains(matrix.python-version, 'pypy') run: | sudo apt-get update sudo apt-get install libxml2-dev libxslt-dev - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.gitignore b/.gitignore index 8d344f16..20dec10b 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ pip-log.txt # Unit test / coverage reports .coverage +/coverage.xml 
.tox nosetests.xml htmlcov diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 00000000..6860bdb0 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..db43480a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: +- repo: https://github.com/PyCQA/bandit + rev: 1.7.8 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 +- repo: https://github.com/psf/black.git + rev: 24.2.0 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort diff --git a/.readthedocs.yml b/.readthedocs.yml index a6f8c799..d4f39082 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -8,7 +8,7 @@ build: tools: # For available versions, see: # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python - python: "3.11" # Keep in sync with .github/workflows/checks.yml + python: "3.12" # Keep in sync with .github/workflows/checks.yml python: install: - requirements: docs/requirements.txt diff --git a/Makefile b/Makefile deleted file mode 100644 index 2a83dc87..00000000 --- a/Makefile +++ /dev/null @@ -1,74 +0,0 @@ -.PHONY: clean-pyc clean-build docs clean - -help: - @echo "clean - remove all build, test, coverage and Python artifacts" - @echo "clean-build - remove build artifacts" - @echo "clean-pyc - remove Python file artifacts" - @echo "clean-test - remove test and coverage artifacts" - @echo "lint - check style with flake8" - @echo "test - run tests quickly with the default Python" - @echo "test-all - run tests on every Python version with tox" - @echo "coverage - check code coverage quickly with the default Python" - @echo "docs - generate Sphinx HTML documentation, including API docs" - @echo "release - package and upload a release" - @echo "dist - package" - @echo "install - 
install the package to the active Python's site-packages" - -clean: clean-build clean-pyc clean-test - -clean-build: - rm -fr build/ - rm -fr dist/ - rm -fr .eggs/ - find . -name '*.egg-info' -exec rm -fr {} + - find . -name '*.egg' -exec rm -f {} + - -clean-pyc: - find . -name '*.pyc' -exec rm -f {} + - find . -name '*.pyo' -exec rm -f {} + - find . -name '*~' -exec rm -f {} + - find . -name '__pycache__' -exec rm -fr {} + - -clean-test: - rm -fr .tox/ - rm -f .coverage - rm -fr htmlcov/ - -lint: - flake8 parsel tests - -test: - nosetests --with-doctest --rednose -s -v - -test-all: - tox - -coverage: - coverage run --source parsel setup.py test - coverage report -m - coverage html - python -m webbrowser htmlcov/index.html - -docs: - ( python -c 'import sphinx_rtd_theme' 2>/dev/null || pip install sphinx_rtd_theme ) - rm -f docs/parsel.rst - rm -f docs/modules.rst - sphinx-apidoc -o docs/ parsel - $(MAKE) -C docs clean - $(MAKE) -C docs html - python -m webbrowser docs/_build/html/index.html - -servedocs: docs - watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D docs/ - -release: clean - python setup.py sdist upload - python setup.py bdist_wheel upload - -dist: clean - python setup.py sdist - python setup.py bdist_wheel - ls -l dist - -install: clean - python setup.py install diff --git a/NEWS b/NEWS index 2b6f41be..034727b9 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,51 @@ History ------- +1.9.1 (2024-04-08) +~~~~~~~~~~~~~~~~~~ + +* Removed the dependency on ``pytest-runner``. +* Removed the obsolete ``Makefile``. 
+ +1.9.0 (2024-03-14) +~~~~~~~~~~~~~~~~~~ + +* Now requires ``cssselect >= 1.2.0`` (this minimum version was required since + 1.8.0 but that wasn't properly recorded) +* Removed support for Python 3.7 +* Added support for Python 3.12 and PyPy 3.10 +* Fixed an exception when calling ``__str__`` or ``__repr__`` on some JSON + selectors +* Code formatted with ``black`` +* CI fixes and improvements + +1.8.1 (2023-04-18) +~~~~~~~~~~~~~~~~~~ + +* Remove a Sphinx reference from NEWS to fix the PyPI description +* Add a ``twine check`` CI check to detect such problems + +1.8.0 (2023-04-18) +~~~~~~~~~~~~~~~~~~ + +* Add support for JMESPath: you can now create a selector for a JSON document + and call ``Selector.jmespath()``. See `the documentation`_ for more + information and examples. +* Selectors can now be constructed from ``bytes`` (using the ``body`` and + ``encoding`` arguments) instead of ``str`` (using the ``text`` argument), so + that there is no internal conversion from ``str`` to ``bytes`` and the memory + usage is lower. +* Typing improvements +* The ``pkg_resources`` module (which was absent from the requirements) is no + longer used +* Documentation build fixes +* New requirements: + + * ``jmespath`` + * ``typing_extensions`` (on Python 3.7) + + .. _the documentation: https://parsel.readthedocs.io/en/latest/usage.html + 1.7.0 (2022-11-01) ~~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index d5309ab9..7fdc75e0 100644 --- a/README.rst +++ b/README.rst @@ -19,9 +19,16 @@ Parsel :alt: Coverage report -Parsel is a BSD-licensed Python_ library to extract and remove data from HTML_ -and XML_ using XPath_ and CSS_ selectors, optionally combined with -`regular expressions`_. +Parsel is a BSD-licensed Python_ library to extract data from HTML_, JSON_, and +XML_ documents. 
+ +It supports: + +- CSS_ and XPath_ expressions for HTML and XML documents + +- JMESPath_ expressions for JSON documents + +- `Regular expressions`_ Find the Parsel online documentation at https://parsel.readthedocs.org. @@ -30,15 +37,18 @@ Example (`open online demo`_): .. code-block:: python >>> from parsel import Selector - >>> selector = Selector(text=""" - -

Hello, Parsel!

- - - """) + >>> text = """ + + +

Hello, Parsel!

+ + + + """ + >>> selector = Selector(text=text) >>> selector.css('h1::text').get() 'Hello, Parsel!' >>> selector.xpath('//h1/text()').re(r'\w+') @@ -47,12 +57,18 @@ Example (`open online demo`_): ... print(li.xpath('.//@href').get()) http://example.com http://scrapy.org - + >>> selector.css('script::text').jmespath("a").get() + 'b' + >>> selector.css('script::text').jmespath("a").getall() + ['b', 'c'] .. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets .. _HTML: https://en.wikipedia.org/wiki/HTML +.. _JMESPath: https://jmespath.org/ +.. _JSON: https://en.wikipedia.org/wiki/JSON .. _open online demo: https://colab.research.google.com/drive/149VFa6Px3wg7S3SEnUqk--TyBrKplxCN#forceEdit=true&sandboxMode=true .. _Python: https://www.python.org/ .. _regular expressions: https://docs.python.org/library/re.html .. _XML: https://en.wikipedia.org/wiki/XML .. _XPath: https://en.wikipedia.org/wiki/XPath + diff --git a/docs/conf.py b/docs/conf.py index 3e877e91..4d7b0d63 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,9 +3,6 @@ import os import sys -import parsel - - # Get the project root dir, which is the parent dir of this cwd = os.getcwd() project_root = os.path.dirname(cwd) @@ -15,6 +12,7 @@ # version is used. sys.path.insert(0, project_root) +import parsel # noqa: E402 # -- General configuration --------------------------------------------- @@ -98,10 +96,9 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1) + ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1), ] - # -- Options for Texinfo output ---------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples @@ -134,6 +131,8 @@ # nitpicky = True # https://github.com/scrapy/cssselect/pull/110 nitpick_ignore = [ + ("py:class", "ExpressionError"), + ("py:class", "SelectorSyntaxError"), ("py:class", "cssselect.xpath.GenericTranslator"), ("py:class", "cssselect.xpath.HTMLTranslator"), ("py:class", "cssselect.xpath.XPathExpr"), diff --git a/docs/usage.rst b/docs/usage.rst index d2c08cd6..7cfa2fce 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -4,32 +4,38 @@ Usage ===== -Create a :class:`~parsel.selector.Selector` object for the HTML or XML text -that you want to parse:: +Create a :class:`~parsel.selector.Selector` object for your input text. + +For HTML or XML, use `CSS`_ or `XPath`_ expressions to select data:: >>> from parsel import Selector - >>> text = "

Hello, Parsel!

" - >>> selector = Selector(text=text) + >>> html_text = "

Hello, Parsel!

" + >>> html_selector = Selector(text=html_text) + >>> html_selector.css('h1') + [] + >>> html_selector.xpath('//h1') # the same, but now with XPath + [] -Then use `CSS`_ or `XPath`_ expressions to select elements:: +For JSON, use `JMESPath`_ expressions to select data:: - >>> selector.css('h1') - [] - >>> selector.xpath('//h1') # the same, but now with XPath - [] + >>> json_text = '{"title":"Hello, Parsel!"}' + >>> json_selector = Selector(text=json_text) + >>> json_selector.jmespath('title') + [] And extract data from those elements:: - >>> selector.css('h1::text').get() + >>> html_selector.xpath('//h1/text()').get() 'Hello, Parsel!' - >>> selector.xpath('//h1/text()').getall() + >>> json_selector.jmespath('title').getall() ['Hello, Parsel!'] .. _CSS: https://www.w3.org/TR/selectors .. _XPath: https://www.w3.org/TR/xpath +.. _JMESPath: https://jmespath.org/ -Learning CSS and XPath -====================== +Learning expression languages +============================= `CSS`_ is a language for applying styles to HTML documents. It defines selectors to associate those styles with specific HTML elements. Resources to @@ -39,6 +45,11 @@ learn CSS_ selectors include: - `XPath/CSS Equivalents in Wikibooks`_ +Parsel support for CSS selectors comes from cssselect, so read about `CSS +selectors supported by cssselect`_. + +.. _CSS selectors supported by cssselect: https://cssselect.readthedocs.io/en/latest/#supported-selectors + `XPath`_ is a language for selecting nodes in XML documents, which can also be used with HTML. Resources to learn XPath_ include: @@ -46,13 +57,22 @@ used with HTML. Resources to learn XPath_ include: - `XPath cheatsheet`_ -You can use either CSS_ or XPath_. CSS_ is usually more readable, but some -things can only be done with XPath_. +For HTML and XML input, you can use either CSS_ or XPath_. CSS_ is usually +more readable, but some things can only be done with XPath_. 
+ +JMESPath_ allows you to declaratively specify how to extract elements from +a JSON document. Resources to learn JMESPath_ include: + +- `JMESPath Tutorial`_ + +- `JMESPath Specification`_ .. _CSS selectors in the MDN: https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors .. _XPath cheatsheet: https://devhints.io/xpath .. _XPath Tutorial in W3Schools: https://www.w3schools.com/xml/xpath_intro.asp .. _XPath/CSS Equivalents in Wikibooks: https://en.wikibooks.org/wiki/XPath/CSS_Equivalents +.. _JMESPath Tutorial: https://jmespath.org/tutorial.html +.. _JMESPath Specification: https://jmespath.org/specification.html Using selectors @@ -95,12 +115,12 @@ So, by looking at the :ref:`HTML code ` of that page, let's construct an XPath for selecting the text inside the title tag:: >>> selector.xpath('//title/text()') - [] + [] You can also ask the same thing using CSS instead:: >>> selector.css('title::text') - [] + [] To actually extract the textual data, you must call the selector ``.get()`` or ``.getall()`` methods, as follows:: @@ -124,10 +144,10 @@ To extract all text of one or more element and all their child elements, formatted as plain text taking into account HTML tags (e.g. ``
`` is translated as a line break), set ``text=True`` in your call to :meth:`~Selector.get` or :meth:`~Selector.getall` instead of including -``::text`` (CSS) or ``/text()`` (XPath) in your query: +``::text`` (CSS) or ``/text()`` (XPath) in your query:: ->>> selector.css('#images').get(text=True) -'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5' + >>> selector.css('#images').get(text=True) + 'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5' See :meth:`Selector.get` for additional parameters that you can use to change how the extracted plain text is formatted. @@ -609,10 +629,10 @@ returns ``True`` for nodes that have all of the specified HTML classes:: ... """) ... >>> sel.xpath('//p[has-class("foo")]') - [, - ] + [, + ] >>> sel.xpath('//p[has-class("foo", "bar-baz")]') - [] + [] >>> sel.xpath('//p[has-class("foo", "bar")]') [] @@ -1023,8 +1043,8 @@ directly by their names:: >>> sel.remove_namespaces() >>> sel.xpath("//link") - [, - , + [, + , ...] If you wonder why the namespace removal procedure isn't called always by default @@ -1069,8 +1089,8 @@ And try to select the links again, now using an "atom:" prefix for the "link" node test:: >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"}) - [, - , + [, + , ...] 
You can pass several namespaces (here we're using shorter 1-letter prefixes):: diff --git a/parsel/__init__.py b/parsel/__init__.py index a0d7f7cc..5fdbf350 100644 --- a/parsel/__init__.py +++ b/parsel/__init__.py @@ -5,7 +5,7 @@ __author__ = "Scrapy project" __email__ = "info@scrapy.org" -__version__ = "1.7.0" +__version__ = "1.9.1" __all__ = [ "Selector", "SelectorList", @@ -13,8 +13,8 @@ "xpathfuncs", ] -from parsel.selector import Selector, SelectorList # NOQA -from parsel.csstranslator import css2xpath # NOQA from parsel import xpathfuncs # NOQA +from parsel.csstranslator import css2xpath # NOQA +from parsel.selector import Selector, SelectorList # NOQA xpathfuncs.setup() diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py index c240e6ac..80bfc7cf 100644 --- a/parsel/csstranslator.py +++ b/parsel/csstranslator.py @@ -1,27 +1,35 @@ from functools import lru_cache +from typing import TYPE_CHECKING, Any, Optional, Protocol from cssselect import GenericTranslator as OriginalGenericTranslator from cssselect import HTMLTranslator as OriginalHTMLTranslator -from cssselect.xpath import XPathExpr as OriginalXPathExpr +from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement from cssselect.xpath import ExpressionError -from cssselect.parser import FunctionalPseudoElement +from cssselect.xpath import XPathExpr as OriginalXPathExpr + +if TYPE_CHECKING: + # typing.Self requires Python 3.11 + from typing_extensions import Self class XPathExpr(OriginalXPathExpr): - textnode = False - attribute = None + textnode: bool = False + attribute: Optional[str] = None @classmethod - def from_xpath(cls, xpath, textnode=False, attribute=None): - x = cls( - path=xpath.path, element=xpath.element, condition=xpath.condition - ) + def from_xpath( + cls, + xpath: OriginalXPathExpr, + textnode: bool = False, + attribute: Optional[str] = None, + ) -> "Self": + x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) x.textnode = textnode 
x.attribute = attribute return x - def __str__(self): + def __str__(self) -> str: path = super().__str__() if self.textnode: if path == "*": @@ -38,38 +46,63 @@ def __str__(self): return path - def join(self, combiner, other, *args, **kwargs): + def join( + self: "Self", + combiner: str, + other: OriginalXPathExpr, + *args: Any, + **kwargs: Any, + ) -> "Self": + if not isinstance(other, XPathExpr): + raise ValueError( + f"Expressions of type {__name__}.XPathExpr can only join expressions" + f" of the same type (or its descendants), got {type(other)}" + ) super().join(combiner, other, *args, **kwargs) self.textnode = other.textnode self.attribute = other.attribute return self +# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator +class TranslatorProtocol(Protocol): + def xpath_element(self, selector: Element) -> OriginalXPathExpr: + pass + + def css_to_xpath(self, css: str, prefix: str = ...) -> str: + pass + + class TranslatorMixin: """This mixin adds support to CSS pseudo elements via dynamic dispatch. Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``. 
""" - def xpath_element(self, selector): - xpath = super().xpath_element(selector) + def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr: + # https://github.com/python/mypy/issues/12344 + xpath = super().xpath_element(selector) # type: ignore[safe-super] return XPathExpr.from_xpath(xpath) - def xpath_pseudo_element(self, xpath, pseudo_element): + def xpath_pseudo_element( + self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement + ) -> OriginalXPathExpr: """ Dispatch method that transforms XPath to support pseudo-element """ if isinstance(pseudo_element, FunctionalPseudoElement): - method = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element" - method = getattr(self, method, None) + method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element" + method = getattr(self, method_name, None) if not method: raise ExpressionError( f"The functional pseudo-element ::{pseudo_element.name}() is unknown" ) xpath = method(xpath, pseudo_element) else: - method = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" - method = getattr(self, method, None) + method_name = ( + f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" + ) + method = getattr(self, method_name, None) if not method: raise ExpressionError( f"The pseudo-element ::{pseudo_element} is unknown" @@ -77,36 +110,36 @@ def xpath_pseudo_element(self, xpath, pseudo_element): xpath = method(xpath) return xpath - def xpath_attr_functional_pseudo_element(self, xpath, function): + def xpath_attr_functional_pseudo_element( + self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement + ) -> XPathExpr: """Support selecting attribute values using ::attr() pseudo-element""" if function.argument_types() not in (["STRING"], ["IDENT"]): raise ExpressionError( f"Expected a single string or ident for ::attr(), got {function.arguments!r}" ) - return XPathExpr.from_xpath( - xpath, attribute=function.arguments[0].value - ) + 
return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value) - def xpath_text_simple_pseudo_element(self, xpath): + def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr: """Support selecting text nodes using ::text pseudo-element""" return XPathExpr.from_xpath(xpath, textnode=True) class GenericTranslator(TranslatorMixin, OriginalGenericTranslator): @lru_cache(maxsize=256) - def css_to_xpath(self, css, prefix="descendant-or-self::"): + def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: return super().css_to_xpath(css, prefix) class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator): @lru_cache(maxsize=256) - def css_to_xpath(self, css, prefix="descendant-or-self::"): + def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: return super().css_to_xpath(css, prefix) _translator = HTMLTranslator() -def css2xpath(query): +def css2xpath(query: str) -> str: "Return translated XPath version of a given CSS query" return _translator.css_to_xpath(query) diff --git a/parsel/selector.py b/parsel/selector.py index bb5312d2..d734f560 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -1,45 +1,43 @@ -""" -XPath selectors based on lxml -""" +"""XPath and JMESPath selectors based on the lxml and jmespath Python +packages.""" +import json import typing import warnings +from io import BytesIO from typing import ( Any, Dict, List, + Literal, Mapping, Optional, Pattern, + SupportsIndex, + Tuple, Type, + TypedDict, TypeVar, Union, ) from warnings import warn -from cssselect import GenericTranslator as OriginalGenericTranslator +import html_text # type: ignore[import] +import jmespath from lxml import etree, html from lxml.html.clean import Cleaner # pylint: disable=no-name-in-module -import html_text # type: ignore[import] from packaging.version import Version from .csstranslator import GenericTranslator, HTMLTranslator from .utils import extract_regex, flatten, iflatten, shorten - 
-if typing.TYPE_CHECKING: - # both require Python 3.8 - from typing import Literal, SupportsIndex - - # simplified _OutputMethodArg from types-lxml - _TostringMethodType = Literal[ - "html", - "xml", - ] - - _SelectorType = TypeVar("_SelectorType", bound="Selector") -_ParserType = Union[etree.XMLParser, etree.HTMLParser] +_ParserType = Union[etree.XMLParser, etree.HTMLParser] # type: ignore[type-arg] +# simplified _OutputMethodArg from types-lxml +_TostringMethodType = Literal[ + "html", + "xml", +] lxml_version = Version(etree.__version__) lxml_huge_tree_version = Version("4.2") @@ -58,13 +56,19 @@ class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent): pass -class SafeXMLParser(etree.XMLParser): - def __init__(self, *args, **kwargs) -> None: +class SafeXMLParser(etree.XMLParser): # type: ignore[type-arg] + def __init__(self, *args: Any, **kwargs: Any) -> None: kwargs.setdefault("resolve_entities", False) super().__init__(*args, **kwargs) -_ctgroup = { +class CTGroupValue(TypedDict): + _parser: Union[Type[etree.XMLParser], Type[html.HTMLParser]] # type: ignore[type-arg] + _csstranslator: Union[GenericTranslator, HTMLTranslator] + _tostring_method: str + + +_ctgroup: Dict[str, CTGroupValue] = { "html": { "_parser": html.HTMLParser, "_csstranslator": HTMLTranslator(), @@ -78,13 +82,8 @@ def __init__(self, *args, **kwargs) -> None: } -def _st(st: Optional[str]) -> str: - if st is None: - return "html" - elif st in _ctgroup: - return st - else: - raise ValueError(f"Invalid type: {st}") +def _xml_or_html(type: Optional[str]) -> str: + return "xml" if type == "xml" else "html" def create_root_node( @@ -92,16 +91,21 @@ def create_root_node( parser_cls: Type[_ParserType], base_url: Optional[str] = None, huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, + body: bytes = b"", + encoding: str = "utf8", ) -> etree._Element: """Create root node for text using given parser class.""" - body = text.strip().replace("\x00", "").encode("utf8") or b"" + if not text: + body = 
body.replace(b"\x00", b"").strip() + else: + body = text.strip().replace("\x00", "").encode(encoding) or b"" + if huge_tree and LXML_SUPPORTS_HUGE_TREE: - parser = parser_cls(recover=True, encoding="utf8", huge_tree=True) - # the stub wrongly thinks base_url can't be None - root = etree.fromstring(body, parser=parser, base_url=base_url) # type: ignore[arg-type] + parser = parser_cls(recover=True, encoding=encoding, huge_tree=True) + root = etree.fromstring(body, parser=parser, base_url=base_url) else: - parser = parser_cls(recover=True, encoding="utf8") - root = etree.fromstring(body, parser=parser, base_url=base_url) # type: ignore[arg-type] + parser = parser_cls(recover=True, encoding=encoding) + root = etree.fromstring(body, parser=parser, base_url=base_url) for error in parser.error_log: if "use XML_PARSE_HUGE option" in error.message: warnings.warn( @@ -132,26 +136,38 @@ def __getitem__( ) -> Union[_SelectorType, "SelectorList[_SelectorType]"]: o = super().__getitem__(pos) if isinstance(pos, slice): - return self.__class__( - typing.cast("SelectorList[_SelectorType]", o) - ) + return self.__class__(typing.cast("SelectorList[_SelectorType]", o)) else: return typing.cast(_SelectorType, o) def __getstate__(self) -> None: raise TypeError("can't pickle SelectorList objects") + def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]": + """ + Call the ``.jmespath()`` method for each element in this list and return + their results flattened as another :class:`SelectorList`. + + ``query`` is the same argument as the one in :meth:`Selector.jmespath`. 
+ + Any additional named arguments are passed to the underlying + ``jmespath.search`` call, e.g.:: + + selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) + """ + return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self])) + def xpath( self, xpath: str, namespaces: Optional[Mapping[str, str]] = None, - **kwargs, + **kwargs: Any, ) -> "SelectorList[_SelectorType]": """ Call the ``.xpath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. - ``query`` is the same argument as the one in :meth:`Selector.xpath` + ``xpath`` is the same argument as the one in :meth:`Selector.xpath` ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) for additional prefixes to those registered with ``register_namespace(prefix, uri)``. @@ -164,9 +180,7 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__( - flatten( - [x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self] - ) + flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) ) def css(self, query: str) -> "SelectorList[_SelectorType]": @@ -190,9 +204,7 @@ def re( Passing ``replace_entities`` as ``False`` switches off these replacements. 
""" - return flatten( - [x.re(regex, replace_entities=replace_entities) for x in self] - ) + return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) @typing.overload def re_first( @@ -232,7 +244,7 @@ def re_first( for el in iflatten( x.re(regex, replace_entities=replace_entities) for x in self ): - return el + return typing.cast(str, el) return default def getall( @@ -328,15 +340,107 @@ def drop(self) -> None: x.drop() +_NOT_SET = object() + + +def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element: + return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs) + + +def _get_root_and_type_from_bytes( + body: bytes, + encoding: str, + *, + input_type: Optional[str], + **lxml_kwargs: Any, +) -> Tuple[Any, str]: + if input_type == "text": + return body.decode(encoding), input_type + if encoding == "utf8": + try: + data = json.load(BytesIO(body)) + except ValueError: + data = _NOT_SET + if data is not _NOT_SET: + return data, "json" + if input_type == "json": + return None, "json" + assert input_type in ("html", "xml", None) # nosec + type = _xml_or_html(input_type) + root = create_root_node( + text="", + body=body, + encoding=encoding, + parser_cls=_ctgroup[type]["_parser"], + **lxml_kwargs, + ) + return root, type + + +def _get_root_and_type_from_text( + text: str, *, input_type: Optional[str], **lxml_kwargs: Any +) -> Tuple[Any, str]: + if input_type == "text": + return text, input_type + try: + data = json.loads(text) + except ValueError: + data = _NOT_SET + if data is not _NOT_SET: + return data, "json" + if input_type == "json": + return None, "json" + assert input_type in ("html", "xml", None) # nosec + type = _xml_or_html(input_type) + root = _get_root_from_text(text, type=type, **lxml_kwargs) + return root, type + + +def _get_root_type(root: Any, *, input_type: Optional[str]) -> str: + if isinstance(root, etree._Element): # pylint: disable=protected-access + if input_type in {"json", "text"}: + 
raise ValueError( + f"Selector got an lxml.etree._Element object as root, " + f"and {input_type!r} as type." + ) + return _xml_or_html(input_type) + elif isinstance(root, (dict, list)) or _is_valid_json(root): + return "json" + return input_type or "json" + + +def _is_valid_json(text: str) -> bool: + try: + json.loads(text) + except (TypeError, ValueError): + return False + else: + return True + + +def _load_json_or_none(text: str) -> Any: + if isinstance(text, (str, bytes, bytearray)): + try: + return json.loads(text) + except ValueError: + return None + return None + + class Selector: - """ - :class:`Selector` allows you to select parts of an XML or HTML text using CSS - or XPath expressions and extract data from it. + """Wrapper for input data in HTML, JSON, or XML format, that allows + selecting parts of it using selection expressions. + + You can write selection expressions in CSS or XPath for HTML and XML + inputs, or in JMESPath for JSON inputs. - ``text`` is a `str`` object + ``text`` is an ``str`` object. - ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). - If ``type`` is ``None``, the selector defaults to ``"html"``. + ``body`` is a ``bytes`` object. It can be used together with the + ``encoding`` argument instead of the ``text`` argument. + + ``type`` defines the selector type. It can be ``"html"`` (default), + ``"json"``, or ``"xml"``. ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths. See the documentation for :func:`lxml.etree.fromstring` for more information. 
@@ -351,18 +455,16 @@ class Selector: """ __slots__ = [ - "text", "namespaces", "type", "_expr", + "_huge_tree", "root", + "_text", + "body", "__weakref__", - "_parser", - "_csstranslator", - "_tostring_method", ] - _default_type: Optional[str] = None _default_namespaces = { "re": "http://exslt.org/regular-expressions", # supported in libxslt: @@ -382,55 +484,139 @@ def __init__( self, text: Optional[str] = None, type: Optional[str] = None, + body: bytes = b"", + encoding: str = "utf8", namespaces: Optional[Mapping[str, str]] = None, - root: Optional[Any] = None, + root: Optional[Any] = _NOT_SET, base_url: Optional[str] = None, _expr: Optional[str] = None, huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, ) -> None: - self.type = st = _st(type or self._default_type) - self._parser: Type[_ParserType] = typing.cast( - Type[_ParserType], _ctgroup[st]["_parser"] - ) - self._csstranslator: OriginalGenericTranslator = typing.cast( - OriginalGenericTranslator, _ctgroup[st]["_csstranslator"] - ) - self._tostring_method: "_TostringMethodType" = typing.cast( - "_TostringMethodType", _ctgroup[st]["_tostring_method"] - ) + self.root: Any + if type not in ("html", "json", "text", "xml", None): + raise ValueError(f"Invalid type: {type}") + + if text is None and not body and root is _NOT_SET: + raise ValueError("Selector needs text, body, or root arguments") + + if text is not None and not isinstance(text, str): + msg = f"text argument should be of type str, got {text.__class__}" + raise TypeError(msg) if text is not None: + if root is not _NOT_SET: + warnings.warn( + "Selector got both text and root, root is being ignored.", + stacklevel=2, + ) if not isinstance(text, str): msg = f"text argument should be of type str, got {text.__class__}" raise TypeError(msg) - root = self._get_root(text, base_url, huge_tree) - elif root is None: - raise ValueError("Selector needs either text or root argument") + + root, type = _get_root_and_type_from_text( + text, + input_type=type, + 
base_url=base_url, + huge_tree=huge_tree, + ) + self.root = root + self.type = type + elif body: + if not isinstance(body, bytes): + msg = f"body argument should be of type bytes, got {body.__class__}" + raise TypeError(msg) + root, type = _get_root_and_type_from_bytes( + body=body, + encoding=encoding, + input_type=type, + base_url=base_url, + huge_tree=huge_tree, + ) + self.root = root + self.type = type + elif root is _NOT_SET: + raise ValueError("Selector needs text, body, or root arguments") + else: + self.root = root + self.type = _get_root_type(root, input_type=type) self.namespaces = dict(self._default_namespaces) if namespaces is not None: self.namespaces.update(namespaces) - self.root = root + self._expr = _expr + self._huge_tree = huge_tree + self._text = text def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") def _get_root( self, - text: str, + text: str = "", base_url: Optional[str] = None, huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, + type: Optional[str] = None, + body: bytes = b"", + encoding: str = "utf8", ) -> etree._Element: return create_root_node( - text, self._parser, base_url=base_url, huge_tree=huge_tree + text, + body=body, + encoding=encoding, + parser_cls=_ctgroup[type or self.type]["_parser"], + base_url=base_url, + huge_tree=huge_tree, ) + def jmespath( + self: _SelectorType, + query: str, + **kwargs: Any, + ) -> SelectorList[_SelectorType]: + """ + Find objects matching the JMESPath ``query`` and return the result as a + :class:`SelectorList` instance with all elements flattened. List + elements implement :class:`Selector` interface too. + + ``query`` is a string containing the `JMESPath + `_ query to apply. + + Any additional named arguments are passed to the underlying + ``jmespath.search`` call, e.g.:: + + selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) + """ + if self.type == "json": + if isinstance(self.root, str): + # Selector received a JSON string as root. 
+ data = _load_json_or_none(self.root) + else: + data = self.root + else: + assert self.type in {"html", "xml"} # nosec + data = _load_json_or_none(self.root.text) + + result = jmespath.search(query, data, **kwargs) + if result is None: + result = [] + elif not isinstance(result, list): + result = [result] + + def make_selector(x: Any) -> _SelectorType: # closure function + if isinstance(x, str): + return self.__class__(text=x, _expr=query, type="text") + else: + return self.__class__(root=x, _expr=query) + + result = [make_selector(x) for x in result] + return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) + def xpath( self: _SelectorType, query: str, namespaces: Optional[Mapping[str, str]] = None, - **kwargs, + **kwargs: Any, ) -> SelectorList[_SelectorType]: """ Find nodes matching the xpath ``query`` and return the result as a @@ -449,12 +635,22 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ - try: - xpathev = self.root.xpath - except AttributeError: - return typing.cast( - SelectorList[_SelectorType], self.selectorlist_cls([]) - ) + if self.type not in ("html", "xml", "text"): + raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}") + if self.type in ("html", "xml"): + try: + xpathev = self.root.xpath + except AttributeError: + return typing.cast( + SelectorList[_SelectorType], self.selectorlist_cls([]) + ) + else: + try: + xpathev = self._get_root(self._text or "", type="html").xpath + except AttributeError: + return typing.cast( + SelectorList[_SelectorType], self.selectorlist_cls([]) + ) nsp = dict(self.namespaces) if namespaces is not None: @@ -474,13 +670,14 @@ def xpath( result = [ self.__class__( - root=x, _expr=query, namespaces=self.namespaces, type=self.type + root=x, + _expr=query, + namespaces=self.namespaces, + type=_xml_or_html(self.type), ) for x in result ] - return typing.cast( - SelectorList[_SelectorType], self.selectorlist_cls(result) - ) + return 
typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: """ @@ -493,10 +690,13 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: .. _cssselect: https://pypi.python.org/pypi/cssselect/ """ + if self.type not in ("html", "xml", "text"): + raise ValueError(f"Cannot use css on a Selector of type {self.type!r}") return self.xpath(self._css2xpath(query)) def _css2xpath(self, query: str) -> str: - return self._csstranslator.css_to_xpath(query) + type = _xml_or_html(self.type) + return _ctgroup[type]["_csstranslator"].css_to_xpath(query) def re( self, regex: Union[str, Pattern[str]], replace_entities: bool = True @@ -513,9 +713,8 @@ def re( Passing ``replace_entities`` as ``False`` switches off these replacements. """ - return extract_regex( - regex, self.get(), replace_entities=replace_entities - ) + data = self.get() + return extract_regex(regex, data, replace_entities=replace_entities) @typing.overload def re_first( @@ -563,10 +762,12 @@ def get( cleaner: Union[str, None, Cleaner] = "auto", guess_punct_space: bool = True, guess_layout: bool = True, - ) -> str: + ) -> Any: """ - Serialize and return the matched nodes in a single string. - Percent encoded content is unquoted. + Serialize and return the matched nodes. + + For HTML and XML, the result is always a string, and percent-encoded + content is unquoted. When ``text`` is False (default), HTML or XML is extracted. Pass ``text=True`` to extract text content (html-text library is used). @@ -601,6 +802,10 @@ def get( be just a single line of text, using whitespaces as separators. This option has no effect when ``text=False``. """ + if self.type in ("text", "json"): + # TODO: what should be the behavior with text=True? 
+ return self.root + sel = self if cleaner == "auto": if text: @@ -617,11 +822,14 @@ def get( ) try: - return etree.tostring( - tree, - method=self._tostring_method, - encoding="unicode", - with_tail=False, + return typing.cast( + str, + etree.tostring( + tree, + method=_ctgroup[self.type]["_tostring_method"], + encoding="unicode", + with_tail=False, + ), ) except (AttributeError, TypeError): if tree is True: @@ -674,10 +882,7 @@ def remove_namespaces(self) -> None: # loop on element attributes also for an in el.attrib: if an.startswith("{"): - # this cast shouldn't be needed as pop never returns None - el.attrib[an.split("}", 1)[1]] = typing.cast( - str, el.attrib.pop(an) - ) + el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an) # remove namespace declarations etree.cleanup_namespaces(self.root) @@ -702,7 +907,7 @@ def remove(self) -> None: ) try: - parent.remove(self.root) # type: ignore[union-attr] + parent.remove(self.root) except AttributeError: # 'NoneType' object has no attribute 'remove' raise CannotRemoveElementWithoutParent( @@ -710,7 +915,7 @@ def remove(self) -> None: "are you trying to remove a root element?" ) - def drop(self): + def drop(self) -> None: """ Drop matched nodes from the parent element. 
""" @@ -727,9 +932,11 @@ def drop(self): try: if self.type == "xml": + if parent is None: + raise ValueError("This node has no parent") parent.remove(self.root) else: - self.root.drop_tree() + typing.cast(html.HtmlElement, self.root).drop_tree() except (AttributeError, AssertionError): # 'NoneType' object has no attribute 'drop' raise CannotDropElementWithoutParent( @@ -787,7 +994,8 @@ def __bool__(self) -> bool: __nonzero__ = __bool__ def __str__(self) -> str: - data = repr(shorten(self.get(), width=40)) - return f"<{type(self).__name__} xpath={self._expr!r} data={data}>" + return str(self.get()) - __repr__ = __str__ + def __repr__(self) -> str: + data = repr(shorten(str(self.get()), width=40)) + return f"<{type(self).__name__} query={self._expr!r} data={data}>" diff --git a/parsel/utils.py b/parsel/utils.py index 5e6d92de..ec77d74b 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -1,9 +1,10 @@ import re -from typing import Any, List, Pattern, Union, cast, Match +from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast + from w3lib.html import replace_entities as w3lib_replace_entities -def flatten(x): +def flatten(x: Iterable[Any]) -> List[Any]: """flatten(sequence) -> list Returns a single, flat list which contains all elements retrieved from the sequence and all recursively contained sub-sequences @@ -21,7 +22,7 @@ def flatten(x): return list(iflatten(x)) -def iflatten(x): +def iflatten(x: Iterable[Any]) -> Iterator[Any]: """iflatten(sequence) -> Iterator Similar to ``.flatten()``, but returns iterator instead""" for el in x: diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py index 9e5c0a96..7633d107 100644 --- a/parsel/xpathfuncs.py +++ b/parsel/xpathfuncs.py @@ -1,13 +1,14 @@ import re -from lxml import etree +from typing import Any, Callable, Optional +from lxml import etree from w3lib.html import HTML5_WHITESPACE regex = f"[{HTML5_WHITESPACE}]+" replace_html5_whitespaces = re.compile(regex).sub -def set_xpathfunc(fname, 
func): +def set_xpathfunc(fname: str, func: Optional[Callable]) -> None: # type: ignore[type-arg] """Register a custom extension function to use in XPath expressions. The function ``func`` registered under ``fname`` identifier will be called @@ -28,11 +29,11 @@ def set_xpathfunc(fname, func): del ns_fns[fname] -def setup(): +def setup() -> None: set_xpathfunc("has-class", has_class) -def has_class(context, *classes): +def has_class(context: Any, *classes: str) -> bool: """has-class function. Return True if all ``classes`` are present in element's class attr. @@ -40,14 +41,10 @@ def has_class(context, *classes): """ if not context.eval_context.get("args_checked"): if not classes: - raise ValueError( - "XPath error: has-class must have at least 1 argument" - ) + raise ValueError("XPath error: has-class must have at least 1 argument") for c in classes: if not isinstance(c, str): - raise ValueError( - "XPath error: has-class arguments must be strings" - ) + raise ValueError("XPath error: has-class arguments must be strings") context.eval_context["args_checked"] = True node_cls = context.context_node.get("class") diff --git a/pylintrc b/pylintrc index 1892721c..c909c457 100644 --- a/pylintrc +++ b/pylintrc @@ -4,7 +4,6 @@ persistent=no [MESSAGES CONTROL] disable=c-extension-no-member, - deprecated-method, fixme, import-error, import-outside-toplevel, @@ -16,6 +15,7 @@ disable=c-extension-no-member, no-else-return, no-member, parse-error, + protected-access, raise-missing-from, redefined-builtin, too-few-public-methods, @@ -26,4 +26,4 @@ disable=c-extension-no-member, unused-argument, use-a-generator, wrong-import-order, - wrong-import-position + wrong-import-position, diff --git a/setup.cfg b/setup.cfg index cf0f47f0..7c964b49 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,2 @@ [wheel] universal=1 - -[aliases] -test=pytest diff --git a/setup.py b/setup.py index 90db4dee..e88321f5 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ from setuptools import setup - with 
open("README.rst", encoding="utf-8") as readme_file: readme = readme_file.read() @@ -11,9 +10,10 @@ setup( name="parsel", - version="1.7.0", + version="1.9.1", description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors", long_description=readme + "\n\n" + history, + long_description_content_type="text/x-rst", author="Scrapy project", author_email="info@scrapy.org", url="https://github.com/scrapy/parsel", @@ -25,13 +25,14 @@ }, include_package_data=True, install_requires=[ - "cssselect>=0.9", + "cssselect>=1.2.0", + "jmespath", "lxml", "packaging", "w3lib>=1.19.0", "html-text>=0.5.2", ], - python_requires=">=3.7", + python_requires=">=3.8", license="BSD", zip_safe=False, keywords="parsel", @@ -44,17 +45,14 @@ "Topic :: Text Processing :: Markup :: HTML", "Topic :: Text Processing :: Markup :: XML", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ], - setup_requires=[ - "pytest-runner", - ], tests_require=[ "pytest", ], diff --git a/tests/test_selector.py b/tests/test_selector.py index 47ce04a8..8b5e554f 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1,18 +1,20 @@ +import pickle import re +import unittest import warnings import weakref -import unittest -import pickle -from typing import Any, cast +from typing import Any, Mapping, Optional, cast from lxml import etree from lxml.html import HtmlElement -from pkg_resources import parse_version +from packaging.version import Version from parsel import Selector, SelectorList from parsel.selector import ( - CannotRemoveElementWithoutRoot, + _NOT_SET, + LXML_SUPPORTS_HUGE_TREE, 
CannotRemoveElementWithoutParent, + CannotRemoveElementWithoutRoot, ) @@ -28,9 +30,7 @@ def assertIsSelectorList(self, value: Any) -> None: def test_pickle_selector(self) -> None: sel = self.sscls(text="

some text

") - self.assertRaises( - TypeError, lambda s: pickle.dumps(s, protocol=2), sel - ) + self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) def test_pickle_selector_list(self) -> None: sel = self.sscls( @@ -40,9 +40,7 @@ def test_pickle_selector_list(self) -> None: empty_sel_list = sel.css("p") self.assertIsSelectorList(sel_list) self.assertIsSelectorList(empty_sel_list) - self.assertRaises( - TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list - ) + self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) self.assertRaises( TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list ) @@ -63,7 +61,8 @@ def test_simple_selection(self) -> None: ) self.assertEqual( - [x.extract() for x in sel.xpath("//input[@name='a']/@name")], ["a"] + [x.extract() for x in sel.xpath("//input[@name='a']/@name")], + ["a"], ) self.assertEqual( [ @@ -94,10 +93,7 @@ def test_simple_selection_with_variables(self) -> None: sel = self.sscls(text=body) self.assertEqual( - [ - x.extract() - for x in sel.xpath("//input[@value=$number]/@name", number=1) - ], + [x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], ["a"], ) self.assertEqual( @@ -119,15 +115,11 @@ def test_simple_selection_with_variables(self) -> None: # you can also pass booleans self.assertEqual( - sel.xpath( - "boolean(count(//input)=$cnt)=$test", cnt=2, test=True - ).extract(), + sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(), ["1"], ) self.assertEqual( - sel.xpath( - "boolean(count(//input)=$cnt)=$test", cnt=4, test=True - ).extract(), + sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(), ["0"], ) self.assertEqual( @@ -157,16 +149,11 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None: t = 'I say "Yeah!"' # naive string formatting with give something like: # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name - self.assertRaises( - ValueError, 
sel.xpath, f'//input[@value="{t}"]/@name' - ) + self.assertRaises(ValueError, sel.xpath, f'//input[@value="{t}"]/@name') # with XPath variables, escaping is done for you self.assertEqual( - [ - x.extract() - for x in sel.xpath("//input[@value=$text]/@name", text=t) - ], + [x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], ["a"], ) lt = """I'm mixing single and "double quotes" and I don't care :)""" @@ -179,9 +166,7 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None: self.assertEqual( [ x.extract() - for x in sel.xpath( - "//p[normalize-space()=$lng]//@name", lng=lt - ) + for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt) ], ["a"], ) @@ -205,9 +190,7 @@ def test_accessing_attributes(self) -> None: ) # for a SelectorList, bring the attributes of first-element only - self.assertEqual( - {"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib - ) + self.assertEqual({"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib) self.assertEqual( {"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib ) @@ -227,9 +210,7 @@ def test_representation_slice(self) -> None: body = f"

" sel = self.sscls(text=body) - representation = ( - f"" - ) + representation = f"" self.assertEqual( [repr(it) for it in sel.xpath("//input/@name")], [representation] @@ -238,9 +219,7 @@ def test_representation_slice(self) -> None: def test_representation_unicode_query(self) -> None: body = f"

" - representation = ( - "" - ) + representation = "" sel = self.sscls(text=body) self.assertEqual( @@ -299,9 +278,7 @@ def test_selector_get_alias(self) -> None: self.assertEqual( sel.xpath("//ul/li[position()>1]")[0].get(), '
  • 2
  • ' ) - self.assertEqual( - sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2" - ) + self.assertEqual(sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2") def test_selector_getall_alias(self) -> None: """Test if get() returns extracted value on a Selector""" @@ -371,9 +348,7 @@ def test_extract_first_re_default(self) -> None: def test_select_unicode_query(self) -> None: body = "

    " sel = self.sscls(text=body) - self.assertEqual( - sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"] - ) + self.assertEqual(sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"]) def test_list_elements_type(self) -> None: """Test Selector returning the same type in selection methods""" @@ -390,12 +365,8 @@ def test_list_elements_type(self) -> None: def test_boolean_result(self) -> None: body = "

    " xs = self.sscls(text=body) - self.assertEqual( - xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"] - ) - self.assertEqual( - xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"] - ) + self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"]) + self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"]) def test_differences_parsing_xml_vs_html(self) -> None: """Test that XML and HTML Selector's behave differently""" @@ -419,7 +390,7 @@ def test_error_for_unknown_selector_type(self) -> None: def test_text_or_root_is_required(self) -> None: self.assertRaisesRegex( ValueError, - "Selector needs either text or root argument", + "Selector needs text, body, or root arguments", self.sscls, ) @@ -525,9 +496,7 @@ def test_mixed_nested_selectors(self) -> None: self.assertEqual( sel.xpath('//div[@id="1"]').css("span::text").extract(), ["me"] ) - self.assertEqual( - sel.css("#1").xpath("./span/text()").extract(), ["me"] - ) + self.assertEqual(sel.css("#1").xpath("./span/text()").extract(), ["me"]) def test_dont_strip(self) -> None: sel = self.sscls(text='
    fff: zzz
    ') @@ -558,7 +527,8 @@ def test_namespaces_adhoc(self) -> None: self.assertEqual( x.xpath( - "//somens:a/text()", namespaces={"somens": "http://scrapy.org"} + "//somens:a/text()", + namespaces={"somens": "http://scrapy.org"}, ).extract(), ["take this"], ) @@ -601,16 +571,12 @@ def test_namespaces_multiple(self) -> None: x.register_namespace("b", "http://somens.com") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], "hello") - self.assertEqual( - x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value" - ) + self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value") self.assertEqual( x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], "90" ) self.assertEqual( - x.xpath("//p:SecondTestTag") - .xpath("./xmlns:price/text()")[0] - .extract(), + x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), "90", ) self.assertEqual( @@ -652,7 +618,8 @@ def test_namespaces_multiple_adhoc(self) -> None: # "xmlns" is still defined self.assertEqual( x.xpath( - "//xmlns:TestTag/@b:att", namespaces={"b": "http://somens.com"} + "//xmlns:TestTag/@b:att", + namespaces={"b": "http://somens.com"}, ).extract()[0], "value", ) @@ -706,9 +673,7 @@ def test_namespaces_multiple_adhoc(self) -> None: ) # "p" prefix is not cached from previous calls - self.assertRaises( - ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()" - ) + self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") x.register_namespace("p", "http://www.scrapy.org/product") self.assertEqual( @@ -780,9 +745,7 @@ def test_re_replace_entities(self) -> None: ) self.assertEqual( - x.xpath("//script/text()").re_first( - name_re, replace_entities=False - ), + x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected, ) self.assertEqual( @@ -793,15 +756,11 @@ def test_re_replace_entities(self) -> None: def test_re_intl(self) -> None: body = "
    Evento: cumplea\xf1os
    " x = self.sscls(text=body) - self.assertEqual( - x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"] - ) + self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"]) def test_selector_over_text(self) -> None: hs = self.sscls(text="lala") - self.assertEqual( - hs.extract(), "lala" - ) + self.assertEqual(hs.extract(), "lala") xs = self.sscls(text="lala", type="xml") self.assertEqual(xs.extract(), "lala") self.assertEqual(xs.xpath(".").extract(), ["lala"]) @@ -827,17 +786,13 @@ def test_http_header_encoding_precedence(self) -> None: \xa3""" x = self.sscls(text=text) - self.assertEqual( - x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"] - ) + self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"]) def test_empty_bodies_shouldnt_raise_errors(self) -> None: self.sscls(text="").xpath("//text()").extract() def test_bodies_with_comments_only(self) -> None: - sel = self.sscls( - text="", base_url="http://example.com" - ) + sel = self.sscls(text="", base_url="http://example.com") self.assertEqual("http://example.com", sel.root.base) def test_null_bytes_shouldnt_raise_errors(self) -> None: @@ -863,9 +818,7 @@ def test_select_on_unevaluable_nodes(self) -> None: self.assertEqual(x1.xpath(".//text()").extract(), []) def test_select_on_text_nodes(self) -> None: - r = self.sscls( - text="
    Options:opt1
    Otheropt2
    " - ) + r = self.sscls(text="
    Options:opt1
    Otheropt2
    ") x1 = r.xpath( "//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]" ) @@ -879,9 +832,7 @@ def test_select_on_text_nodes(self) -> None: @unittest.skip("Text nodes lost parent node reference in lxml") def test_nested_select_on_text_nodes(self) -> None: # FIXME: does not work with lxml backend [upstream] - r = self.sscls( - text="
    Options:opt1
    Otheropt2
    " - ) + r = self.sscls(text="
    Options:opt1
    Otheropt2
    ") x1 = r.xpath("//div/descendant::text()") x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") self.assertEqual(x2.extract(), ["Options:"]) @@ -934,17 +885,14 @@ def test_remove_namespaces_embedded(self) -> None: self.assertEqual( len( sel.xpath( - "//f:link", namespaces={"f": "http://www.w3.org/2005/Atom"} + "//f:link", + namespaces={"f": "http://www.w3.org/2005/Atom"}, ) ), 2, ) self.assertEqual( - len( - sel.xpath( - "//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"} - ) - ), + len(sel.xpath("//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"})), 2, ) sel.remove_namespaces() @@ -992,19 +940,13 @@ class SmartStringsSelector(Selector): li_text = x.xpath("//li/text()") self.assertFalse(any([hasattr(e.root, "getparent") for e in li_text])) div_class = x.xpath("//div/@class") - self.assertFalse( - any([hasattr(e.root, "getparent") for e in div_class]) - ) + self.assertFalse(any([hasattr(e.root, "getparent") for e in div_class])) smart_x = SmartStringsSelector(text=body) smart_li_text = smart_x.xpath("//li/text()") - self.assertTrue( - all([hasattr(e.root, "getparent") for e in smart_li_text]) - ) + self.assertTrue(all([hasattr(e.root, "getparent") for e in smart_li_text])) smart_div_class = smart_x.xpath("//div/@class") - self.assertTrue( - all([hasattr(e.root, "getparent") for e in smart_div_class]) - ) + self.assertTrue(all([hasattr(e.root, "getparent") for e in smart_div_class])) def test_xml_entity_expansion(self) -> None: malicious_xml = ( @@ -1029,7 +971,7 @@ class MySelector(Selector): selectorlist_cls = MySelectorList def extra_method(self) -> str: - return "extra" + self.get() + return "extra" + cast(str, self.get()) sel = MySelector(text="
    foo
    ") self.assertIsInstance(sel.xpath("//div"), MySelectorList) @@ -1123,9 +1065,9 @@ def test_drop_with_xml_type(self) -> None: el.drop() assert sel.get() == "" - def test_deep_nesting(self): - lxml_version = parse_version(etree.__version__) - lxml_huge_tree_version = parse_version("4.2") + def test_deep_nesting(self) -> None: + lxml_version = Version(etree.__version__) + lxml_huge_tree_version = Version("4.2") content = """ @@ -1194,6 +1136,81 @@ def test_deep_nesting(self): self.assertEqual(len(sel.css("span")), nest_level) self.assertEqual(len(sel.css("td")), 1) + def test_invalid_type(self) -> None: + with self.assertRaises(ValueError): + self.sscls("", type="xhtml") + + def test_default_type(self) -> None: + text = "foo" + selector = self.sscls(text) + self.assertEqual(selector.type, "html") + + def test_json_type(self) -> None: + obj = 1 + selector = self.sscls(str(obj), type="json") + self.assertEqual(selector.root, obj) + self.assertEqual(selector.type, "json") + + def test_html_root(self) -> None: + root = etree.fromstring("") + selector = self.sscls(root=root) + self.assertEqual(selector.root, root) + self.assertEqual(selector.type, "html") + + def test_json_root(self) -> None: + obj = 1 + selector = self.sscls(root=obj) + self.assertEqual(selector.root, obj) + self.assertEqual(selector.type, "json") + + def test_json_xpath(self) -> None: + obj = 1 + selector = self.sscls(root=obj) + with self.assertRaises(ValueError): + selector.xpath("//*") + + def test_json_css(self) -> None: + obj = 1 + selector = self.sscls(root=obj) + with self.assertRaises(ValueError): + selector.css("*") + + def test_invalid_json(self) -> None: + text = "" + selector = self.sscls(text, type="json") + self.assertEqual(selector.root, None) + self.assertEqual(selector.type, "json") + + def test_text_and_root_warning(self) -> None: + with warnings.catch_warnings(record=True) as w: + Selector(text="a", root="b") + self.assertIn("both text and root", str(w[0].message)) + + def 
test_etree_root_invalid_type(self) -> None: + selector = Selector("") + self.assertRaisesRegex( + ValueError, + "object as root", + Selector, + root=selector.root, + type="text", + ) + self.assertRaisesRegex( + ValueError, + "object as root", + Selector, + root=selector.root, + type="json", + ) + + def test_json_selector_representation(self) -> None: + selector = Selector(text="true") + assert repr(selector) == "" + assert str(selector) == "True" + selector = Selector(text="1") + assert repr(selector) == "" + assert str(selector) == "1" + class ExsltTestCase(unittest.TestCase): @@ -1214,30 +1231,18 @@ def test_regexp(self) -> None: # re:test() self.assertEqual( sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(), - [ - x.extract() - for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]') - ], + [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')], ) self.assertEqual( - [ - x.extract() - for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()') - ], + [x.extract() for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()')], ["first link", "second link"], ) self.assertEqual( - [ - x.extract() - for x in sel.xpath('//a[re:test(@href, "first")]/text()') - ], + [x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()')], ["first link"], ) self.assertEqual( - [ - x.extract() - for x in sel.xpath('//a[re:test(@href, "second")]/text()') - ], + [x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()')], ["second link"], ) @@ -1267,9 +1272,7 @@ def test_regexp(self) -> None: r're:replace(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+)://(.+)(\.xml)", "","https://\2.html")' ).extract(), - [ - "https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html" - ], + ["https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html"], ) def test_set(self) -> None: @@ -1359,9 +1362,9 @@ def test_text_get(self): self.assertEqual(txt, "title:\n\nsome text") def test_text_getall(self): - sel = self.sscls( - text="
    • option1
    • option2
    " - ).getall(text=True) + sel = self.sscls(text="
    • option1
    • option2
    ").getall( + text=True + ) self.assertEqual(1, len(sel)) self.assertEqual("option1\noption2", sel[0]) @@ -1399,3 +1402,57 @@ def test_text_xpath_get(self): html.xpath('//div[@class="product"]/span').getall(text=True), ["Price: 100", "Price: 200"], ) + + +class SelectorBytesInput(Selector): + def __init__( + self, + text: Optional[str] = None, + type: Optional[str] = None, + body: bytes = b"", + encoding: str = "utf8", + namespaces: Optional[Mapping[str, str]] = None, + root: Optional[Any] = _NOT_SET, + base_url: Optional[str] = None, + _expr: Optional[str] = None, + huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, + ) -> None: + if text: + body = bytes(text, encoding=encoding) + text = None + super().__init__( + text=text, + type=type, + body=body, + encoding=encoding, + namespaces=namespaces, + root=root, + base_url=base_url, + _expr=_expr, + huge_tree=huge_tree, + ) + + +class SelectorTestCaseBytes(SelectorTestCase): + sscls = SelectorBytesInput + + def test_representation_slice(self) -> None: + pass + + def test_representation_unicode_query(self) -> None: + pass + + def test_weakref_slots(self) -> None: + pass + + def test_check_text_argument_type(self) -> None: + self.assertRaisesRegex( + TypeError, + "body argument should be of type", + self.sscls, + body="", + ) + + +class ExsltTestCaseBytes(ExsltTestCase): + sscls = SelectorBytesInput diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py index e2934d17..2adc2f5e 100644 --- a/tests/test_selector_csstranslator.py +++ b/tests/test_selector_csstranslator.py @@ -1,17 +1,18 @@ """ Selector tests for cssselect backend """ + import unittest +from typing import Any, Callable, List, Protocol, Tuple, Type, Union import cssselect import pytest -from packaging.version import Version - -from parsel.csstranslator import GenericTranslator, HTMLTranslator -from parsel import Selector from cssselect.parser import SelectorSyntaxError from cssselect.xpath import ExpressionError +from 
packaging.version import Version +from parsel import Selector +from parsel.csstranslator import GenericTranslator, HTMLTranslator, TranslatorProtocol HTMLBODY = """ @@ -49,12 +50,32 @@ """ +class TranslatorTestProtocol(Protocol): + tr_cls: Type[TranslatorProtocol] + tr: TranslatorProtocol + + def c2x(self, css: str, prefix: str = ...) -> str: + pass + + def assertEqual(self, first: Any, second: Any, msg: Any = ...) -> None: + pass + + def assertRaises( + self, + expected_exception: Union[Type[BaseException], Tuple[Type[BaseException], ...]], + callable: Callable[..., object], + *args: Any, + **kwargs: Any, + ) -> None: + pass + + class TranslatorTestMixin: - def setUp(self): + def setUp(self: TranslatorTestProtocol) -> None: self.tr = self.tr_cls() self.c2x = self.tr.css_to_xpath - def test_attr_function(self): + def test_attr_function(self: TranslatorTestProtocol) -> None: cases = [ ("::attr(name)", "descendant-or-self::*/@name"), ("a::attr(href)", "descendant-or-self::a/@href"), @@ -67,7 +88,7 @@ def test_attr_function(self): for css, xpath in cases: self.assertEqual(self.c2x(css), xpath, css) - def test_attr_function_exception(self): + def test_attr_function_exception(self: TranslatorTestProtocol) -> None: cases = [ ("::attr(12)", ExpressionError), ("::attr(34test)", ExpressionError), @@ -76,7 +97,7 @@ def test_attr_function_exception(self): for css, exc in cases: self.assertRaises(exc, self.c2x, css) - def test_text_pseudo_element(self): + def test_text_pseudo_element(self: TranslatorTestProtocol) -> None: cases = [ ("::text", "descendant-or-self::text()"), ("p::text", "descendant-or-self::p/text()"), @@ -105,7 +126,7 @@ def test_text_pseudo_element(self): for css, xpath in cases: self.assertEqual(self.c2x(css), xpath, css) - def test_pseudo_function_exception(self): + def test_pseudo_function_exception(self: TranslatorTestProtocol) -> None: cases = [ ("::attribute(12)", ExpressionError), ("::text()", ExpressionError), @@ -114,14 +135,14 @@ def 
test_pseudo_function_exception(self): for css, exc in cases: self.assertRaises(exc, self.c2x, css) - def test_unknown_pseudo_element(self): + def test_unknown_pseudo_element(self: TranslatorTestProtocol) -> None: cases = [ ("::text-node", ExpressionError), ] for css, exc in cases: self.assertRaises(exc, self.c2x, css) - def test_unknown_pseudo_class(self): + def test_unknown_pseudo_class(self: TranslatorTestProtocol) -> None: cases = [ (":text", ExpressionError), (":attribute(name)", ExpressionError), @@ -139,7 +160,7 @@ class GenericTranslatorTest(TranslatorTestMixin, unittest.TestCase): class UtilCss2XPathTest(unittest.TestCase): - def test_css2xpath(self): + def test_css2xpath(self) -> None: from parsel import css2xpath expected_xpath = ( @@ -153,15 +174,13 @@ class CSSSelectorTest(unittest.TestCase): sscls = Selector - def setUp(self): + def setUp(self) -> None: self.sel = self.sscls(text=HTMLBODY) - def x(self, *a, **kw): - return [ - v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip() - ] + def x(self, *a: Any, **kw: Any) -> List[str]: + return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()] - def test_selector_simple(self): + def test_selector_simple(self) -> None: for x in self.sel.css("input"): self.assertTrue(isinstance(x, self.sel.__class__), x) self.assertEqual( @@ -169,7 +188,7 @@ def test_selector_simple(self): [x.extract() for x in self.sel.css("input")], ) - def test_text_pseudo_element(self): + def test_text_pseudo_element(self) -> None: self.assertEqual(self.x("#p-b2"), ['guy']) self.assertEqual(self.x("#p-b2::text"), ["guy"]) self.assertEqual(self.x("#p-b2 ::text"), ["guy"]) @@ -179,11 +198,9 @@ def test_text_pseudo_element(self): ["lorem ipsum text", "hi", "there", "guy"], ) self.assertEqual(self.x("p::text"), ["lorem ipsum text"]) - self.assertEqual( - self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"] - ) + self.assertEqual(self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"]) - def 
test_attribute_function(self): + def test_attribute_function(self) -> None: self.assertEqual(self.x("#p-b2::attr(id)"), ["p-b2"]) self.assertEqual(self.x(".cool-footer::attr(class)"), ["cool-footer"]) self.assertEqual( @@ -193,10 +210,8 @@ def test_attribute_function(self): self.x('map[name="dummymap"] ::attr(shape)'), ["circle", "default"] ) - def test_nested_selector(self): - self.assertEqual( - self.sel.css("p").css("b::text").extract(), ["hi", "guy"] - ) + def test_nested_selector(self) -> None: + self.assertEqual(self.sel.css("p").css("b::text").extract(), ["hi", "guy"]) self.assertEqual( self.sel.css("div").css("area:last-child").extract(), [''], @@ -206,5 +221,10 @@ def test_nested_selector(self): Version(cssselect.__version__) < Version("1.2.0"), reason="Support added in cssselect 1.2.0", ) - def test_pseudoclass_has(self): + def test_pseudoclass_has(self) -> None: self.assertEqual(self.x("p:has(b)::text"), ["lorem ipsum text"]) + + +class CSSSelectorTestBytes(CSSSelectorTest): + def setUp(self) -> None: + self.sel = self.sscls(body=bytes(HTMLBODY, encoding="utf8")) diff --git a/tests/test_selector_jmespath.py b/tests/test_selector_jmespath.py new file mode 100644 index 00000000..5afbd6d0 --- /dev/null +++ b/tests/test_selector_jmespath.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- + +import unittest + +from parsel import Selector +from parsel.selector import _NOT_SET + + +class JMESPathTestCase(unittest.TestCase): + def test_json_has_html(self) -> None: + """Sometimes the information is returned in a json wrapper""" + data = """ + { + "content": [ + { + "name": "A", + "value": "a" + }, + { + "name": { + "age": 18 + }, + "value": "b" + }, + { + "name": "C", + "value": "c" + }, + { + "name": "D", + "value": "
    d
    " + } + ], + "html": "
    def
    " + } + """ + sel = Selector(text=data) + self.assertEqual( + sel.jmespath("html").get(), + "
    def
    ", + ) + self.assertEqual( + sel.jmespath("html").xpath("//div/a/text()").getall(), + ["a", "b", "d"], + ) + self.assertEqual(sel.jmespath("html").css("div > b").getall(), ["f"]) + self.assertEqual(sel.jmespath("content").jmespath("name.age").get(), 18) + + def test_html_has_json(self) -> None: + html_text = """ +
    +

    Information

    + + { + "user": [ + { + "name": "A", + "age": 18 + }, + { + "name": "B", + "age": 32 + }, + { + "name": "C", + "age": 22 + }, + { + "name": "D", + "age": 25 + } + ], + "total": 4, + "status": "ok" + } + +
    + """ + sel = Selector(text=html_text) + self.assertEqual( + sel.xpath("//div/content/text()").jmespath("user[*].name").getall(), + ["A", "B", "C", "D"], + ) + self.assertEqual( + sel.xpath("//div/content").jmespath("user[*].name").getall(), + ["A", "B", "C", "D"], + ) + self.assertEqual(sel.xpath("//div/content").jmespath("total").get(), 4) + + def test_jmestpath_with_re(self) -> None: + html_text = """ +
    +

    Information

    + + { + "user": [ + { + "name": "A", + "age": 18 + }, + { + "name": "B", + "age": 32 + }, + { + "name": "C", + "age": 22 + }, + { + "name": "D", + "age": 25 + } + ], + "total": 4, + "status": "ok" + } + +
    + """ + sel = Selector(text=html_text) + self.assertEqual( + sel.xpath("//div/content/text()").jmespath("user[*].name").re(r"(\w+)"), + ["A", "B", "C", "D"], + ) + self.assertEqual( + sel.xpath("//div/content").jmespath("user[*].name").re(r"(\w+)"), + ["A", "B", "C", "D"], + ) + + with self.assertRaises(TypeError): + sel.xpath("//div/content").jmespath("user[*].age").re(r"(\d+)") + + self.assertEqual( + sel.xpath("//div/content").jmespath("unavailable").re(r"(\d+)"), [] + ) + + self.assertEqual( + sel.xpath("//div/content").jmespath("unavailable").re_first(r"(\d+)"), + None, + ) + + self.assertEqual( + sel.xpath("//div/content") + .jmespath("user[*].age.to_string(@)") + .re(r"(\d+)"), + ["18", "32", "22", "25"], + ) + + def test_json_types(self) -> None: + for text, root in ( + ("{}", {}), + ('{"a": "b"}', {"a": "b"}), + ("[]", []), + ('["a"]', ["a"]), + ('""', ""), + ("0", 0), + ("1", 1), + ("true", True), + ("false", False), + ("null", None), + ): + selector = Selector(text=text, root=_NOT_SET) + self.assertEqual(selector.type, "json") + self.assertEqual(selector._text, text) # pylint: disable=protected-access + self.assertEqual(selector.root, root) + + selector = Selector(text=None, root=root) + self.assertEqual(selector.type, "json") + self.assertEqual(selector._text, None) # pylint: disable=protected-access + self.assertEqual(selector.root, root) diff --git a/tests/test_utils.py b/tests/test_utils.py index e2bca559..ee3e1121 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,9 @@ -from parsel.utils import shorten, extract_regex +from typing import List, Pattern, Type, Union from pytest import mark, raises +from parsel.utils import extract_regex, shorten + @mark.parametrize( "width,expected", @@ -17,7 +19,7 @@ (7, "foobar"), ), ) -def test_shorten(width, expected): +def test_shorten(width: int, expected: Union[str, Type[Exception]]) -> None: if isinstance(expected, str): assert shorten("foobar", width) == expected else: @@ -66,5 +68,10 @@ 
def test_shorten(width, expected): ], ), ) -def test_extract_regex(regex, text, replace_entities, expected): +def test_extract_regex( + regex: Union[str, Pattern[str]], + text: str, + replace_entities: bool, + expected: List[str], +) -> None: assert extract_regex(regex, text, replace_entities) == expected diff --git a/tests/test_xml_attacks.py b/tests/test_xml_attacks.py index 45b0243a..e7b5a486 100644 --- a/tests/test_xml_attacks.py +++ b/tests/test_xml_attacks.py @@ -7,11 +7,10 @@ from parsel import Selector - MiB_1 = 1024**2 -def _load(attack): +def _load(attack: str) -> str: folder_path = path.dirname(__file__) file_path = path.join(folder_path, "xml_attacks", f"{attack}.xml") with open(file_path, "rb") as attack_file: @@ -21,7 +20,7 @@ def _load(attack): # List of known attacks: # https://github.com/tiran/defusedxml#python-xml-libraries class XMLAttackTestCase(TestCase): - def test_billion_laughs(self): + def test_billion_laughs(self) -> None: process = Process() memory_usage_before = process.memory_info().rss selector = Selector(text=_load("billion_laughs")) diff --git a/tests/test_xpathfuncs.py b/tests/test_xpathfuncs.py index 744472a9..7739982d 100644 --- a/tests/test_xpathfuncs.py +++ b/tests/test_xpathfuncs.py @@ -1,10 +1,12 @@ +import unittest +from typing import Any + from parsel import Selector from parsel.xpathfuncs import set_xpathfunc -import unittest class XPathFuncsTestCase(unittest.TestCase): - def test_has_class_simple(self): + def test_has_class_simple(self) -> None: body = """

    First

    Second

    @@ -21,21 +23,15 @@ def test_has_class_simple(self): ["Third"], ) self.assertEqual( - [ - x.extract() - for x in sel.xpath('//p[has-class("foo","bar")]/text()') - ], + [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')], [], ) self.assertEqual( - [ - x.extract() - for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()') - ], + [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')], ["First"], ) - def test_has_class_error_no_args(self): + def test_has_class_error_no_args(self) -> None: body = """

    First

    """ @@ -47,7 +43,7 @@ def test_has_class_error_no_args(self): "has-class()", ) - def test_has_class_error_invalid_arg_type(self): + def test_has_class_error_invalid_arg_type(self) -> None: body = """

    First

    """ @@ -59,7 +55,7 @@ def test_has_class_error_invalid_arg_type(self): "has-class(.)", ) - def test_has_class_error_invalid_unicode(self): + def test_has_class_error_invalid_unicode(self) -> None: body = """

    First

    """ @@ -71,7 +67,7 @@ def test_has_class_error_invalid_unicode(self): 'has-class("héllö")'.encode(), ) - def test_has_class_unicode(self): + def test_has_class_unicode(self) -> None: body = """

    First

    """ @@ -81,7 +77,7 @@ def test_has_class_unicode(self): ["First"], ) - def test_has_class_uppercase(self): + def test_has_class_uppercase(self) -> None: body = """

    First

    """ @@ -91,7 +87,7 @@ def test_has_class_uppercase(self): ["First"], ) - def test_has_class_newline(self): + def test_has_class_newline(self) -> None: body = """

    First

    @@ -102,7 +98,7 @@ def test_has_class_newline(self): ["First"], ) - def test_has_class_tab(self): + def test_has_class_tab(self) -> None: body = """

    First

    """ @@ -112,11 +108,11 @@ def test_has_class_tab(self): ["First"], ) - def test_set_xpathfunc(self): - def myfunc(ctx): - myfunc.call_count += 1 + def test_set_xpathfunc(self) -> None: + def myfunc(ctx: Any) -> None: + myfunc.call_count += 1 # type: ignore[attr-defined] - myfunc.call_count = 0 + myfunc.call_count = 0 # type: ignore[attr-defined] body = """

    First

    @@ -131,7 +127,7 @@ def myfunc(ctx): set_xpathfunc("myfunc", myfunc) sel.xpath("myfunc()") - self.assertEqual(myfunc.call_count, 1) + self.assertEqual(myfunc.call_count, 1) # type: ignore[attr-defined] set_xpathfunc("myfunc", None) self.assertRaisesRegex( diff --git a/tests/typing/selector.py b/tests/typing/selector.py index 72310634..b6f14345 100644 --- a/tests/typing/selector.py +++ b/tests/typing/selector.py @@ -1,6 +1,7 @@ # Basic usage of the Selector, strongly typed to test the typing of parsel's API. import re from typing import List + from parsel import Selector diff --git a/tox.ini b/tox.ini index e5e5f47e..893b8de7 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = security,flake8,typing,pylint,black,docs,py37,py38,py39,py310,pypy3.9 +envlist = typing,pylint,docs,twinecheck,pre-commit,py38,py39,py310,py311,py312,pypy3.9,pypy3.10 [testenv] usedevelop = True @@ -7,42 +7,25 @@ deps = -r{toxinidir}/tests/requirements.txt commands = py.test --cov=parsel --cov-report=xml {posargs:docs parsel tests} -[testenv:security] -deps = - bandit -commands = - bandit -r -c .bandit.yml {posargs:parsel} - -[testenv:flake8] -deps = - {[testenv]deps} - flake8==5.0.4 -commands = - flake8 {posargs: parsel tests setup.py} - [testenv:typing] deps = {[testenv]deps} - types-lxml==2022.4.10 - types-psutil==5.9.5.4 - types-setuptools==65.5.0.1 - mypy==0.982 + types-jmespath==1.0.2.20240106 + types-lxml==2024.2.9 + types-psutil==5.9.5.20240311 + types-setuptools==69.1.0.20240310 + py==1.11.0 + mypy==1.9.0 commands = - mypy {posargs:parsel tests} --warn-unused-ignores --show-error-codes + mypy {posargs:parsel tests} --strict [testenv:pylint] deps = {[testenv]deps} - pylint==2.15.4 + pylint==3.1.0 commands = pylint docs parsel tests setup.py -[testenv:black] -deps = - black==22.10.0 -commands = - black --line-length=79 --check {posargs:parsel tests setup.py} - [docs] changedir = docs deps = -rdocs/requirements.txt @@ -55,4 +38,18 @@ deps = {[docs]deps} commands = 
sphinx-build -W -b html . {envtmpdir}/html sphinx-build -b latex . {envtmpdir}/latex - sphinx-build -W -b epub . {envtmpdir}/epub + sphinx-build -b epub . {envtmpdir}/epub + +[testenv:twinecheck] +basepython = python3 +deps = + twine==5.0.0 + build==1.1.1 +commands = + python -m build --sdist + twine check dist/* + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure +skip_install = true