diff --git a/.bandit.yml b/.bandit.yml
index f4d993c2..4f60a02f 100644
--- a/.bandit.yml
+++ b/.bandit.yml
@@ -1,3 +1,6 @@
skips:
+- B101
+- B311
- B320
- B410
+exclude_dirs: ['tests']
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index a0e930d9..adaa9807 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.7.0
+current_version = 1.9.1
commit = True
tag = True
tag_name = v{new_version}
diff --git a/.coveragerc b/.coveragerc
index a0e59ef0..ba07b2fb 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,6 +1,5 @@
[run]
branch = true
-include = parsel/*
[report]
exclude_lines =
diff --git a/.flake8 b/.flake8
index fe2937e4..7e5efc63 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,5 @@
[flake8]
-ignore = E203
+ignore = E203,W503
per-file-ignores =
docs/conftest.py:E501
parsel/csstranslator.py:E501
@@ -9,6 +9,7 @@ per-file-ignores =
setup.py:E501
tests/test_selector.py:E501
tests/test_selector_csstranslator.py:E501
+ tests/test_selector_jmespath.py:E501
tests/test_utils.py:E501
tests/test_xpathfuncs.py:E501
tests/typing/*.py:E,F
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 00000000..00d5546f
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# applying pre-commit hooks to the project
+a57c23e3b7be0f001595bd8767fe05e40a66e730
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index ef4487e0..b06e7901 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -8,30 +8,27 @@ jobs:
fail-fast: false
matrix:
include:
- - python-version: "3.11"
+ - python-version: "3.12"
env:
- TOXENV: security
- - python-version: "3.11"
- env:
- TOXENV: flake8
- - python-version: "3.11"
+ TOXENV: pre-commit
+ - python-version: "3.12"
env:
TOXENV: pylint
- - python-version: "3.11" # Keep in sync with .readthedocs.yml
+ - python-version: "3.12"
env:
TOXENV: docs
- - python-version: "3.11"
+ - python-version: "3.12"
env:
TOXENV: typing
- - python-version: "3.11"
+ - python-version: "3.12"
env:
- TOXENV: black
+ TOXENV: twinecheck
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c4b017d9..ebbb8dff 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -7,12 +7,12 @@ jobs:
if: startsWith(github.event.ref, 'refs/tags/')
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- - name: Set up Python 3.10
- uses: actions/setup-python@v4
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v5
with:
- python-version: "3.10"
+ python-version: "3.12"
- name: Check Tag
id: check-release-tag
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b86ab67b..eb2561d0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,9 +8,6 @@ jobs:
fail-fast: false
matrix:
include:
- - python-version: "3.7"
- env:
- TOXENV: py
- python-version: "3.8"
env:
TOXENV: py
@@ -23,21 +20,24 @@ jobs:
- python-version: "3.11"
env:
TOXENV: py
- - python-version: pypy3.9
+ - python-version: "3.12"
+ env:
+ TOXENV: py
+ - python-version: pypy3.10
env:
TOXENV: pypy3
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Install system libraries
- if: contains(matrix.python-version, 'pypy3.9')
+ if: contains(matrix.python-version, 'pypy')
run: |
sudo apt-get update
sudo apt-get install libxml2-dev libxslt-dev
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
diff --git a/.gitignore b/.gitignore
index 8d344f16..20dec10b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ pip-log.txt
# Unit test / coverage reports
.coverage
+/coverage.xml
.tox
nosetests.xml
htmlcov
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 00000000..6860bdb0
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+profile = black
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..db43480a
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+- repo: https://github.com/PyCQA/bandit
+ rev: 1.7.8
+ hooks:
+ - id: bandit
+ args: [-r, -c, .bandit.yml]
+- repo: https://github.com/PyCQA/flake8
+ rev: 7.0.0
+ hooks:
+ - id: flake8
+- repo: https://github.com/psf/black.git
+ rev: 24.2.0
+ hooks:
+ - id: black
+- repo: https://github.com/pycqa/isort
+ rev: 5.13.2
+ hooks:
+ - id: isort
diff --git a/.readthedocs.yml b/.readthedocs.yml
index a6f8c799..d4f39082 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -8,7 +8,7 @@ build:
tools:
# For available versions, see:
# https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
- python: "3.11" # Keep in sync with .github/workflows/checks.yml
+ python: "3.12" # Keep in sync with .github/workflows/checks.yml
python:
install:
- requirements: docs/requirements.txt
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 2a83dc87..00000000
--- a/Makefile
+++ /dev/null
@@ -1,74 +0,0 @@
-.PHONY: clean-pyc clean-build docs clean
-
-help:
- @echo "clean - remove all build, test, coverage and Python artifacts"
- @echo "clean-build - remove build artifacts"
- @echo "clean-pyc - remove Python file artifacts"
- @echo "clean-test - remove test and coverage artifacts"
- @echo "lint - check style with flake8"
- @echo "test - run tests quickly with the default Python"
- @echo "test-all - run tests on every Python version with tox"
- @echo "coverage - check code coverage quickly with the default Python"
- @echo "docs - generate Sphinx HTML documentation, including API docs"
- @echo "release - package and upload a release"
- @echo "dist - package"
- @echo "install - install the package to the active Python's site-packages"
-
-clean: clean-build clean-pyc clean-test
-
-clean-build:
- rm -fr build/
- rm -fr dist/
- rm -fr .eggs/
- find . -name '*.egg-info' -exec rm -fr {} +
- find . -name '*.egg' -exec rm -f {} +
-
-clean-pyc:
- find . -name '*.pyc' -exec rm -f {} +
- find . -name '*.pyo' -exec rm -f {} +
- find . -name '*~' -exec rm -f {} +
- find . -name '__pycache__' -exec rm -fr {} +
-
-clean-test:
- rm -fr .tox/
- rm -f .coverage
- rm -fr htmlcov/
-
-lint:
- flake8 parsel tests
-
-test:
- nosetests --with-doctest --rednose -s -v
-
-test-all:
- tox
-
-coverage:
- coverage run --source parsel setup.py test
- coverage report -m
- coverage html
- python -m webbrowser htmlcov/index.html
-
-docs:
- ( python -c 'import sphinx_rtd_theme' 2>/dev/null || pip install sphinx_rtd_theme )
- rm -f docs/parsel.rst
- rm -f docs/modules.rst
- sphinx-apidoc -o docs/ parsel
- $(MAKE) -C docs clean
- $(MAKE) -C docs html
- python -m webbrowser docs/_build/html/index.html
-
-servedocs: docs
- watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D docs/
-
-release: clean
- python setup.py sdist upload
- python setup.py bdist_wheel upload
-
-dist: clean
- python setup.py sdist
- python setup.py bdist_wheel
- ls -l dist
-
-install: clean
- python setup.py install
diff --git a/NEWS b/NEWS
index 2b6f41be..034727b9 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,51 @@
History
-------
+1.9.1 (2024-04-08)
+~~~~~~~~~~~~~~~~~~
+
+* Removed the dependency on ``pytest-runner``.
+* Removed the obsolete ``Makefile``.
+
+1.9.0 (2024-03-14)
+~~~~~~~~~~~~~~~~~~
+
+* Now requires ``cssselect >= 1.2.0`` (this minimum version was required since
+ 1.8.0 but that wasn't properly recorded)
+* Removed support for Python 3.7
+* Added support for Python 3.12 and PyPy 3.10
+* Fixed an exception when calling ``__str__`` or ``__repr__`` on some JSON
+ selectors
+* Code formatted with ``black``
+* CI fixes and improvements
+
+1.8.1 (2023-04-18)
+~~~~~~~~~~~~~~~~~~
+
+* Remove a Sphinx reference from NEWS to fix the PyPI description
+* Add a ``twine check`` CI check to detect such problems
+
+1.8.0 (2023-04-18)
+~~~~~~~~~~~~~~~~~~
+
+* Add support for JMESPath: you can now create a selector for a JSON document
+ and call ``Selector.jmespath()``. See `the documentation`_ for more
+ information and examples.
+* Selectors can now be constructed from ``bytes`` (using the ``body`` and
+ ``encoding`` arguments) instead of ``str`` (using the ``text`` argument), so
+ that there is no internal conversion from ``str`` to ``bytes`` and the memory
+ usage is lower.
+* Typing improvements
+* The ``pkg_resources`` module (which was absent from the requirements) is no
+ longer used
+* Documentation build fixes
+* New requirements:
+
+ * ``jmespath``
+ * ``typing_extensions`` (on Python 3.7)
+
+ .. _the documentation: https://parsel.readthedocs.io/en/latest/usage.html
+
1.7.0 (2022-11-01)
~~~~~~~~~~~~~~~~~~
diff --git a/README.rst b/README.rst
index d5309ab9..7fdc75e0 100644
--- a/README.rst
+++ b/README.rst
@@ -19,9 +19,16 @@ Parsel
:alt: Coverage report
-Parsel is a BSD-licensed Python_ library to extract and remove data from HTML_
-and XML_ using XPath_ and CSS_ selectors, optionally combined with
-`regular expressions`_.
+Parsel is a BSD-licensed Python_ library to extract data from HTML_, JSON_, and
+XML_ documents.
+
+It supports:
+
+- CSS_ and XPath_ expressions for HTML and XML documents
+
+- JMESPath_ expressions for JSON documents
+
+- `Regular expressions`_
Find the Parsel online documentation at https://parsel.readthedocs.org.
@@ -30,15 +37,18 @@ Example (`open online demo`_):
.. code-block:: python
>>> from parsel import Selector
- >>> selector = Selector(text="""
-
- Hello, Parsel!
-
-
- """)
+ >>> text = """
+
+
+ Hello, Parsel!
+
+
+
+ """
+ >>> selector = Selector(text=text)
>>> selector.css('h1::text').get()
'Hello, Parsel!'
>>> selector.xpath('//h1/text()').re(r'\w+')
@@ -47,12 +57,18 @@ Example (`open online demo`_):
... print(li.xpath('.//@href').get())
http://example.com
http://scrapy.org
-
+ >>> selector.css('script::text').jmespath("a").get()
+ 'b'
+ >>> selector.css('script::text').jmespath("a").getall()
+ ['b', 'c']
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
.. _HTML: https://en.wikipedia.org/wiki/HTML
+.. _JMESPath: https://jmespath.org/
+.. _JSON: https://en.wikipedia.org/wiki/JSON
.. _open online demo: https://colab.research.google.com/drive/149VFa6Px3wg7S3SEnUqk--TyBrKplxCN#forceEdit=true&sandboxMode=true
.. _Python: https://www.python.org/
.. _regular expressions: https://docs.python.org/library/re.html
.. _XML: https://en.wikipedia.org/wiki/XML
.. _XPath: https://en.wikipedia.org/wiki/XPath
+
diff --git a/docs/conf.py b/docs/conf.py
index 3e877e91..4d7b0d63 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -3,9 +3,6 @@
import os
import sys
-import parsel
-
-
# Get the project root dir, which is the parent dir of this
cwd = os.getcwd()
project_root = os.path.dirname(cwd)
@@ -15,6 +12,7 @@
# version is used.
sys.path.insert(0, project_root)
+import parsel # noqa: E402
# -- General configuration ---------------------------------------------
@@ -98,10 +96,9 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1)
+ ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1),
]
-
# -- Options for Texinfo output ----------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
@@ -134,6 +131,8 @@
# nitpicky = True # https://github.com/scrapy/cssselect/pull/110
nitpick_ignore = [
+ ("py:class", "ExpressionError"),
+ ("py:class", "SelectorSyntaxError"),
("py:class", "cssselect.xpath.GenericTranslator"),
("py:class", "cssselect.xpath.HTMLTranslator"),
("py:class", "cssselect.xpath.XPathExpr"),
diff --git a/docs/usage.rst b/docs/usage.rst
index d2c08cd6..7cfa2fce 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -4,32 +4,38 @@
Usage
=====
-Create a :class:`~parsel.selector.Selector` object for the HTML or XML text
-that you want to parse::
+Create a :class:`~parsel.selector.Selector` object for your input text.
+
+For HTML or XML, use `CSS`_ or `XPath`_ expressions to select data::
>>> from parsel import Selector
- >>> text = "Hello, Parsel!
"
- >>> selector = Selector(text=text)
+ >>> html_text = "Hello, Parsel!
"
+ >>> html_selector = Selector(text=html_text)
+ >>> html_selector.css('h1')
+ []
+ >>> html_selector.xpath('//h1') # the same, but now with XPath
+ []
-Then use `CSS`_ or `XPath`_ expressions to select elements::
+For JSON, use `JMESPath`_ expressions to select data::
- >>> selector.css('h1')
- []
- >>> selector.xpath('//h1') # the same, but now with XPath
- []
+ >>> json_text = '{"title":"Hello, Parsel!"}'
+ >>> json_selector = Selector(text=json_text)
+ >>> json_selector.jmespath('title')
+ []
And extract data from those elements::
- >>> selector.css('h1::text').get()
+ >>> html_selector.xpath('//h1/text()').get()
'Hello, Parsel!'
- >>> selector.xpath('//h1/text()').getall()
+ >>> json_selector.jmespath('title').getall()
['Hello, Parsel!']
.. _CSS: https://www.w3.org/TR/selectors
.. _XPath: https://www.w3.org/TR/xpath
+.. _JMESPath: https://jmespath.org/
-Learning CSS and XPath
-======================
+Learning expression languages
+=============================
`CSS`_ is a language for applying styles to HTML documents. It defines
selectors to associate those styles with specific HTML elements. Resources to
@@ -39,6 +45,11 @@ learn CSS_ selectors include:
- `XPath/CSS Equivalents in Wikibooks`_
+Parsel's support for CSS selectors comes from cssselect, so read about `CSS
+selectors supported by cssselect`_.
+
+.. _CSS selectors supported by cssselect: https://cssselect.readthedocs.io/en/latest/#supported-selectors
+
`XPath`_ is a language for selecting nodes in XML documents, which can also be
used with HTML. Resources to learn XPath_ include:
@@ -46,13 +57,22 @@ used with HTML. Resources to learn XPath_ include:
- `XPath cheatsheet`_
-You can use either CSS_ or XPath_. CSS_ is usually more readable, but some
-things can only be done with XPath_.
+For HTML and XML input, you can use either CSS_ or XPath_. CSS_ is usually
+more readable, but some things can only be done with XPath_.
+
+JMESPath_ allows you to declaratively specify how to extract elements from
+a JSON document. Resources to learn JMESPath_ include:
+
+- `JMESPath Tutorial`_
+
+- `JMESPath Specification`_
.. _CSS selectors in the MDN: https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors
.. _XPath cheatsheet: https://devhints.io/xpath
.. _XPath Tutorial in W3Schools: https://www.w3schools.com/xml/xpath_intro.asp
.. _XPath/CSS Equivalents in Wikibooks: https://en.wikibooks.org/wiki/XPath/CSS_Equivalents
+.. _JMESPath Tutorial: https://jmespath.org/tutorial.html
+.. _JMESPath Specification: https://jmespath.org/specification.html
Using selectors
@@ -95,12 +115,12 @@ So, by looking at the :ref:`HTML code ` of that
page, let's construct an XPath for selecting the text inside the title tag::
>>> selector.xpath('//title/text()')
- []
+ []
You can also ask the same thing using CSS instead::
>>> selector.css('title::text')
- []
+ []
To actually extract the textual data, you must call the selector ``.get()``
or ``.getall()`` methods, as follows::
@@ -124,10 +144,10 @@ To extract all text of one or more element and all their child elements,
formatted as plain text taking into account HTML tags (e.g. ``
`` is
translated as a line break), set ``text=True`` in your call to
:meth:`~Selector.get` or :meth:`~Selector.getall` instead of including
-``::text`` (CSS) or ``/text()`` (XPath) in your query:
+``::text`` (CSS) or ``/text()`` (XPath) in your query::
->>> selector.css('#images').get(text=True)
-'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5'
+ >>> selector.css('#images').get(text=True)
+ 'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5'
See :meth:`Selector.get` for additional parameters that you can use to change
how the extracted plain text is formatted.
@@ -609,10 +629,10 @@ returns ``True`` for nodes that have all of the specified HTML classes::
... """)
...
>>> sel.xpath('//p[has-class("foo")]')
- [,
- ]
+ [,
+ ]
>>> sel.xpath('//p[has-class("foo", "bar-baz")]')
- []
+ []
>>> sel.xpath('//p[has-class("foo", "bar")]')
[]
@@ -1023,8 +1043,8 @@ directly by their names::
>>> sel.remove_namespaces()
>>> sel.xpath("//link")
- [,
- ,
+ [,
+ ,
...]
If you wonder why the namespace removal procedure isn't called always by default
@@ -1069,8 +1089,8 @@ And try to select the links again, now using an "atom:" prefix
for the "link" node test::
>>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
- [,
- ,
+ [,
+ ,
...]
You can pass several namespaces (here we're using shorter 1-letter prefixes)::
diff --git a/parsel/__init__.py b/parsel/__init__.py
index a0d7f7cc..5fdbf350 100644
--- a/parsel/__init__.py
+++ b/parsel/__init__.py
@@ -5,7 +5,7 @@
__author__ = "Scrapy project"
__email__ = "info@scrapy.org"
-__version__ = "1.7.0"
+__version__ = "1.9.1"
__all__ = [
"Selector",
"SelectorList",
@@ -13,8 +13,8 @@
"xpathfuncs",
]
-from parsel.selector import Selector, SelectorList # NOQA
-from parsel.csstranslator import css2xpath # NOQA
from parsel import xpathfuncs # NOQA
+from parsel.csstranslator import css2xpath # NOQA
+from parsel.selector import Selector, SelectorList # NOQA
xpathfuncs.setup()
diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index c240e6ac..80bfc7cf 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -1,27 +1,35 @@
from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Optional, Protocol
from cssselect import GenericTranslator as OriginalGenericTranslator
from cssselect import HTMLTranslator as OriginalHTMLTranslator
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from cssselect.xpath import ExpressionError
-from cssselect.parser import FunctionalPseudoElement
+from cssselect.xpath import XPathExpr as OriginalXPathExpr
+
+if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
+ from typing_extensions import Self
class XPathExpr(OriginalXPathExpr):
- textnode = False
- attribute = None
+ textnode: bool = False
+ attribute: Optional[str] = None
@classmethod
- def from_xpath(cls, xpath, textnode=False, attribute=None):
- x = cls(
- path=xpath.path, element=xpath.element, condition=xpath.condition
- )
+ def from_xpath(
+ cls,
+ xpath: OriginalXPathExpr,
+ textnode: bool = False,
+ attribute: Optional[str] = None,
+ ) -> "Self":
+ x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
x.textnode = textnode
x.attribute = attribute
return x
- def __str__(self):
+ def __str__(self) -> str:
path = super().__str__()
if self.textnode:
if path == "*":
@@ -38,38 +46,63 @@ def __str__(self):
return path
- def join(self, combiner, other, *args, **kwargs):
+ def join(
+ self: "Self",
+ combiner: str,
+ other: OriginalXPathExpr,
+ *args: Any,
+ **kwargs: Any,
+ ) -> "Self":
+ if not isinstance(other, XPathExpr):
+ raise ValueError(
f"Expressions of type {__name__}.XPathExpr can only join expressions"
+ f" of the same type (or its descendants), got {type(other)}"
+ )
super().join(combiner, other, *args, **kwargs)
self.textnode = other.textnode
self.attribute = other.attribute
return self
+# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
+class TranslatorProtocol(Protocol):
+ def xpath_element(self, selector: Element) -> OriginalXPathExpr:
+ pass
+
+ def css_to_xpath(self, css: str, prefix: str = ...) -> str:
+ pass
+
+
class TranslatorMixin:
"""This mixin adds support to CSS pseudo elements via dynamic dispatch.
Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
"""
- def xpath_element(self, selector):
- xpath = super().xpath_element(selector)
+ def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
+ # https://github.com/python/mypy/issues/12344
+ xpath = super().xpath_element(selector) # type: ignore[safe-super]
return XPathExpr.from_xpath(xpath)
- def xpath_pseudo_element(self, xpath, pseudo_element):
+ def xpath_pseudo_element(
+ self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
+ ) -> OriginalXPathExpr:
"""
Dispatch method that transforms XPath to support pseudo-element
"""
if isinstance(pseudo_element, FunctionalPseudoElement):
- method = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
- method = getattr(self, method, None)
+ method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
+ method = getattr(self, method_name, None)
if not method:
raise ExpressionError(
f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
)
xpath = method(xpath, pseudo_element)
else:
- method = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
- method = getattr(self, method, None)
+ method_name = (
+ f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
+ )
+ method = getattr(self, method_name, None)
if not method:
raise ExpressionError(
f"The pseudo-element ::{pseudo_element} is unknown"
@@ -77,36 +110,36 @@ def xpath_pseudo_element(self, xpath, pseudo_element):
xpath = method(xpath)
return xpath
- def xpath_attr_functional_pseudo_element(self, xpath, function):
+ def xpath_attr_functional_pseudo_element(
+ self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement
+ ) -> XPathExpr:
"""Support selecting attribute values using ::attr() pseudo-element"""
if function.argument_types() not in (["STRING"], ["IDENT"]):
raise ExpressionError(
f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
)
- return XPathExpr.from_xpath(
- xpath, attribute=function.arguments[0].value
- )
+ return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)
- def xpath_text_simple_pseudo_element(self, xpath):
+ def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr:
"""Support selecting text nodes using ::text pseudo-element"""
return XPathExpr.from_xpath(xpath, textnode=True)
class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
@lru_cache(maxsize=256)
- def css_to_xpath(self, css, prefix="descendant-or-self::"):
+ def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
return super().css_to_xpath(css, prefix)
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
@lru_cache(maxsize=256)
- def css_to_xpath(self, css, prefix="descendant-or-self::"):
+ def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
return super().css_to_xpath(css, prefix)
_translator = HTMLTranslator()
-def css2xpath(query):
+def css2xpath(query: str) -> str:
"Return translated XPath version of a given CSS query"
return _translator.css_to_xpath(query)
diff --git a/parsel/selector.py b/parsel/selector.py
index bb5312d2..d734f560 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -1,45 +1,43 @@
-"""
-XPath selectors based on lxml
-"""
+"""XPath and JMESPath selectors based on the lxml and jmespath Python
+packages."""
+import json
import typing
import warnings
+from io import BytesIO
from typing import (
Any,
Dict,
List,
+ Literal,
Mapping,
Optional,
Pattern,
+ SupportsIndex,
+ Tuple,
Type,
+ TypedDict,
TypeVar,
Union,
)
from warnings import warn
-from cssselect import GenericTranslator as OriginalGenericTranslator
+import html_text # type: ignore[import]
+import jmespath
from lxml import etree, html
from lxml.html.clean import Cleaner # pylint: disable=no-name-in-module
-import html_text # type: ignore[import]
from packaging.version import Version
from .csstranslator import GenericTranslator, HTMLTranslator
from .utils import extract_regex, flatten, iflatten, shorten
-
-if typing.TYPE_CHECKING:
- # both require Python 3.8
- from typing import Literal, SupportsIndex
-
- # simplified _OutputMethodArg from types-lxml
- _TostringMethodType = Literal[
- "html",
- "xml",
- ]
-
-
_SelectorType = TypeVar("_SelectorType", bound="Selector")
-_ParserType = Union[etree.XMLParser, etree.HTMLParser]
+_ParserType = Union[etree.XMLParser, etree.HTMLParser] # type: ignore[type-arg]
+# simplified _OutputMethodArg from types-lxml
+_TostringMethodType = Literal[
+ "html",
+ "xml",
+]
lxml_version = Version(etree.__version__)
lxml_huge_tree_version = Version("4.2")
@@ -58,13 +56,19 @@ class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent):
pass
-class SafeXMLParser(etree.XMLParser):
- def __init__(self, *args, **kwargs) -> None:
+class SafeXMLParser(etree.XMLParser): # type: ignore[type-arg]
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
kwargs.setdefault("resolve_entities", False)
super().__init__(*args, **kwargs)
-_ctgroup = {
+class CTGroupValue(TypedDict):
+ _parser: Union[Type[etree.XMLParser], Type[html.HTMLParser]] # type: ignore[type-arg]
+ _csstranslator: Union[GenericTranslator, HTMLTranslator]
+ _tostring_method: str
+
+
+_ctgroup: Dict[str, CTGroupValue] = {
"html": {
"_parser": html.HTMLParser,
"_csstranslator": HTMLTranslator(),
@@ -78,13 +82,8 @@ def __init__(self, *args, **kwargs) -> None:
}
-def _st(st: Optional[str]) -> str:
- if st is None:
- return "html"
- elif st in _ctgroup:
- return st
- else:
- raise ValueError(f"Invalid type: {st}")
+def _xml_or_html(type: Optional[str]) -> str:
+ return "xml" if type == "xml" else "html"
def create_root_node(
@@ -92,16 +91,21 @@ def create_root_node(
parser_cls: Type[_ParserType],
base_url: Optional[str] = None,
huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+ body: bytes = b"",
+ encoding: str = "utf8",
) -> etree._Element:
"""Create root node for text using given parser class."""
- body = text.strip().replace("\x00", "").encode("utf8") or b""
+ if not text:
+ body = body.replace(b"\x00", b"").strip()
+ else:
+ body = text.strip().replace("\x00", "").encode(encoding) or b""
+
if huge_tree and LXML_SUPPORTS_HUGE_TREE:
- parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
- # the stub wrongly thinks base_url can't be None
- root = etree.fromstring(body, parser=parser, base_url=base_url) # type: ignore[arg-type]
+ parser = parser_cls(recover=True, encoding=encoding, huge_tree=True)
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
else:
- parser = parser_cls(recover=True, encoding="utf8")
- root = etree.fromstring(body, parser=parser, base_url=base_url) # type: ignore[arg-type]
+ parser = parser_cls(recover=True, encoding=encoding)
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
for error in parser.error_log:
if "use XML_PARSE_HUGE option" in error.message:
warnings.warn(
@@ -132,26 +136,38 @@ def __getitem__(
) -> Union[_SelectorType, "SelectorList[_SelectorType]"]:
o = super().__getitem__(pos)
if isinstance(pos, slice):
- return self.__class__(
- typing.cast("SelectorList[_SelectorType]", o)
- )
+ return self.__class__(typing.cast("SelectorList[_SelectorType]", o))
else:
return typing.cast(_SelectorType, o)
def __getstate__(self) -> None:
raise TypeError("can't pickle SelectorList objects")
+ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
+ """
+ Call the ``.jmespath()`` method for each element in this list and return
+ their results flattened as another :class:`SelectorList`.
+
+ ``query`` is the same argument as the one in :meth:`Selector.jmespath`.
+
+ Any additional named arguments are passed to the underlying
+ ``jmespath.search`` call, e.g.::
+
+ selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
+ """
+ return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
+
def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
- **kwargs,
+ **kwargs: Any,
) -> "SelectorList[_SelectorType]":
"""
Call the ``.xpath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
- ``query`` is the same argument as the one in :meth:`Selector.xpath`
+ ``xpath`` is the same argument as the one in :meth:`Selector.xpath`
``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
@@ -164,9 +180,7 @@ def xpath(
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
return self.__class__(
- flatten(
- [x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]
- )
+ flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
)
def css(self, query: str) -> "SelectorList[_SelectorType]":
@@ -190,9 +204,7 @@ def re(
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- return flatten(
- [x.re(regex, replace_entities=replace_entities) for x in self]
- )
+ return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
@typing.overload
def re_first(
@@ -232,7 +244,7 @@ def re_first(
for el in iflatten(
x.re(regex, replace_entities=replace_entities) for x in self
):
- return el
+ return typing.cast(str, el)
return default
def getall(
@@ -328,15 +340,107 @@ def drop(self) -> None:
x.drop()
+_NOT_SET = object()
+
+
+def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element:
+ return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs)
+
+
+def _get_root_and_type_from_bytes(
+ body: bytes,
+ encoding: str,
+ *,
+ input_type: Optional[str],
+ **lxml_kwargs: Any,
+) -> Tuple[Any, str]:
+ if input_type == "text":
+ return body.decode(encoding), input_type
+ if encoding == "utf8":
+ try:
+ data = json.load(BytesIO(body))
+ except ValueError:
+ data = _NOT_SET
+ if data is not _NOT_SET:
+ return data, "json"
+ if input_type == "json":
+ return None, "json"
+ assert input_type in ("html", "xml", None) # nosec
+ type = _xml_or_html(input_type)
+ root = create_root_node(
+ text="",
+ body=body,
+ encoding=encoding,
+ parser_cls=_ctgroup[type]["_parser"],
+ **lxml_kwargs,
+ )
+ return root, type
+
+
+def _get_root_and_type_from_text(
+ text: str, *, input_type: Optional[str], **lxml_kwargs: Any
+) -> Tuple[Any, str]:
+ if input_type == "text":
+ return text, input_type
+ try:
+ data = json.loads(text)
+ except ValueError:
+ data = _NOT_SET
+ if data is not _NOT_SET:
+ return data, "json"
+ if input_type == "json":
+ return None, "json"
+ assert input_type in ("html", "xml", None) # nosec
+ type = _xml_or_html(input_type)
+ root = _get_root_from_text(text, type=type, **lxml_kwargs)
+ return root, type
+
+
+def _get_root_type(root: Any, *, input_type: Optional[str]) -> str:
+ if isinstance(root, etree._Element): # pylint: disable=protected-access
+ if input_type in {"json", "text"}:
+ raise ValueError(
+ f"Selector got an lxml.etree._Element object as root, "
+ f"and {input_type!r} as type."
+ )
+ return _xml_or_html(input_type)
+ elif isinstance(root, (dict, list)) or _is_valid_json(root):
+ return "json"
+ return input_type or "json"
+
+
+def _is_valid_json(text: str) -> bool:
+ try:
+ json.loads(text)
+ except (TypeError, ValueError):
+ return False
+ else:
+ return True
+
+
+def _load_json_or_none(text: str) -> Any:
+ if isinstance(text, (str, bytes, bytearray)):
+ try:
+ return json.loads(text)
+ except ValueError:
+ return None
+ return None
+
+
class Selector:
- """
- :class:`Selector` allows you to select parts of an XML or HTML text using CSS
- or XPath expressions and extract data from it.
+ """Wrapper for input data in HTML, JSON, or XML format, that allows
+ selecting parts of it using selection expressions.
+
+ You can write selection expressions in CSS or XPath for HTML and XML
+ inputs, or in JMESPath for JSON inputs.
- ``text`` is a `str`` object
+ ``text`` is an ``str`` object.
- ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
- If ``type`` is ``None``, the selector defaults to ``"html"``.
+ ``body`` is a ``bytes`` object. It can be used together with the
+ ``encoding`` argument instead of the ``text`` argument.
+
+ ``type`` defines the selector type. It can be ``"html"`` (default),
+ ``"json"``, or ``"xml"``.
``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.
See the documentation for :func:`lxml.etree.fromstring` for more information.
@@ -351,18 +455,16 @@ class Selector:
"""
__slots__ = [
- "text",
"namespaces",
"type",
"_expr",
+ "_huge_tree",
"root",
+ "_text",
+ "body",
"__weakref__",
- "_parser",
- "_csstranslator",
- "_tostring_method",
]
- _default_type: Optional[str] = None
_default_namespaces = {
"re": "http://exslt.org/regular-expressions",
# supported in libxslt:
@@ -382,55 +484,139 @@ def __init__(
self,
text: Optional[str] = None,
type: Optional[str] = None,
+ body: bytes = b"",
+ encoding: str = "utf8",
namespaces: Optional[Mapping[str, str]] = None,
- root: Optional[Any] = None,
+ root: Optional[Any] = _NOT_SET,
base_url: Optional[str] = None,
_expr: Optional[str] = None,
huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
) -> None:
- self.type = st = _st(type or self._default_type)
- self._parser: Type[_ParserType] = typing.cast(
- Type[_ParserType], _ctgroup[st]["_parser"]
- )
- self._csstranslator: OriginalGenericTranslator = typing.cast(
- OriginalGenericTranslator, _ctgroup[st]["_csstranslator"]
- )
- self._tostring_method: "_TostringMethodType" = typing.cast(
- "_TostringMethodType", _ctgroup[st]["_tostring_method"]
- )
+ self.root: Any
+ if type not in ("html", "json", "text", "xml", None):
+ raise ValueError(f"Invalid type: {type}")
+
+ if text is None and not body and root is _NOT_SET:
+ raise ValueError("Selector needs text, body, or root arguments")
+
+ if text is not None and not isinstance(text, str):
+ msg = f"text argument should be of type str, got {text.__class__}"
+ raise TypeError(msg)
if text is not None:
+ if root is not _NOT_SET:
+ warnings.warn(
+ "Selector got both text and root, root is being ignored.",
+ stacklevel=2,
+ )
if not isinstance(text, str):
msg = f"text argument should be of type str, got {text.__class__}"
raise TypeError(msg)
- root = self._get_root(text, base_url, huge_tree)
- elif root is None:
- raise ValueError("Selector needs either text or root argument")
+
+ root, type = _get_root_and_type_from_text(
+ text,
+ input_type=type,
+ base_url=base_url,
+ huge_tree=huge_tree,
+ )
+ self.root = root
+ self.type = type
+ elif body:
+ if not isinstance(body, bytes):
+ msg = f"body argument should be of type bytes, got {body.__class__}"
+ raise TypeError(msg)
+ root, type = _get_root_and_type_from_bytes(
+ body=body,
+ encoding=encoding,
+ input_type=type,
+ base_url=base_url,
+ huge_tree=huge_tree,
+ )
+ self.root = root
+ self.type = type
+ elif root is _NOT_SET:
+ raise ValueError("Selector needs text, body, or root arguments")
+ else:
+ self.root = root
+ self.type = _get_root_type(root, input_type=type)
self.namespaces = dict(self._default_namespaces)
if namespaces is not None:
self.namespaces.update(namespaces)
- self.root = root
+
self._expr = _expr
+ self._huge_tree = huge_tree
+ self._text = text
def __getstate__(self) -> Any:
raise TypeError("can't pickle Selector objects")
def _get_root(
self,
- text: str,
+ text: str = "",
base_url: Optional[str] = None,
huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+ type: Optional[str] = None,
+ body: bytes = b"",
+ encoding: str = "utf8",
) -> etree._Element:
return create_root_node(
- text, self._parser, base_url=base_url, huge_tree=huge_tree
+ text,
+ body=body,
+ encoding=encoding,
+ parser_cls=_ctgroup[type or self.type]["_parser"],
+ base_url=base_url,
+ huge_tree=huge_tree,
)
+ def jmespath(
+ self: _SelectorType,
+ query: str,
+ **kwargs: Any,
+ ) -> SelectorList[_SelectorType]:
+ """
+ Find objects matching the JMESPath ``query`` and return the result as a
+ :class:`SelectorList` instance with all elements flattened. List
+ elements implement :class:`Selector` interface too.
+
+ ``query`` is a string containing the `JMESPath
+ `_ query to apply.
+
+ Any additional named arguments are passed to the underlying
+ ``jmespath.search`` call, e.g.::
+
+ selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
+ """
+ if self.type == "json":
+ if isinstance(self.root, str):
+ # Selector received a JSON string as root.
+ data = _load_json_or_none(self.root)
+ else:
+ data = self.root
+ else:
+ assert self.type in {"html", "xml"} # nosec
+ data = _load_json_or_none(self.root.text)
+
+ result = jmespath.search(query, data, **kwargs)
+ if result is None:
+ result = []
+ elif not isinstance(result, list):
+ result = [result]
+
+ def make_selector(x: Any) -> _SelectorType: # closure function
+ if isinstance(x, str):
+ return self.__class__(text=x, _expr=query, type="text")
+ else:
+ return self.__class__(root=x, _expr=query)
+
+ result = [make_selector(x) for x in result]
+ return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
+
def xpath(
self: _SelectorType,
query: str,
namespaces: Optional[Mapping[str, str]] = None,
- **kwargs,
+ **kwargs: Any,
) -> SelectorList[_SelectorType]:
"""
Find nodes matching the xpath ``query`` and return the result as a
@@ -449,12 +635,22 @@ def xpath(
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
- try:
- xpathev = self.root.xpath
- except AttributeError:
- return typing.cast(
- SelectorList[_SelectorType], self.selectorlist_cls([])
- )
+ if self.type not in ("html", "xml", "text"):
+ raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}")
+ if self.type in ("html", "xml"):
+ try:
+ xpathev = self.root.xpath
+ except AttributeError:
+ return typing.cast(
+ SelectorList[_SelectorType], self.selectorlist_cls([])
+ )
+ else:
+ try:
+ xpathev = self._get_root(self._text or "", type="html").xpath
+ except AttributeError:
+ return typing.cast(
+ SelectorList[_SelectorType], self.selectorlist_cls([])
+ )
nsp = dict(self.namespaces)
if namespaces is not None:
@@ -474,13 +670,14 @@ def xpath(
result = [
self.__class__(
- root=x, _expr=query, namespaces=self.namespaces, type=self.type
+ root=x,
+ _expr=query,
+ namespaces=self.namespaces,
+ type=_xml_or_html(self.type),
)
for x in result
]
- return typing.cast(
- SelectorList[_SelectorType], self.selectorlist_cls(result)
- )
+ return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
"""
@@ -493,10 +690,13 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
.. _cssselect: https://pypi.python.org/pypi/cssselect/
"""
+ if self.type not in ("html", "xml", "text"):
+ raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
return self.xpath(self._css2xpath(query))
def _css2xpath(self, query: str) -> str:
- return self._csstranslator.css_to_xpath(query)
+ type = _xml_or_html(self.type)
+ return _ctgroup[type]["_csstranslator"].css_to_xpath(query)
def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
@@ -513,9 +713,8 @@ def re(
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- return extract_regex(
- regex, self.get(), replace_entities=replace_entities
- )
+ data = self.get()
+ return extract_regex(regex, data, replace_entities=replace_entities)
@typing.overload
def re_first(
@@ -563,10 +762,12 @@ def get(
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
- ) -> str:
+ ) -> Any:
"""
- Serialize and return the matched nodes in a single string.
- Percent encoded content is unquoted.
+ Serialize and return the matched nodes.
+
+ For HTML and XML, the result is always a string, and percent-encoded
+ content is unquoted.
When ``text`` is False (default), HTML or XML is extracted. Pass
``text=True`` to extract text content (html-text library is used).
@@ -601,6 +802,10 @@ def get(
be just a single line of text, using whitespaces as separators.
This option has no effect when ``text=False``.
"""
+ if self.type in ("text", "json"):
+ # TODO: what should be the behavior with text=True?
+ return self.root
+
sel = self
if cleaner == "auto":
if text:
@@ -617,11 +822,14 @@ def get(
)
try:
- return etree.tostring(
- tree,
- method=self._tostring_method,
- encoding="unicode",
- with_tail=False,
+ return typing.cast(
+ str,
+ etree.tostring(
+ tree,
+ method=_ctgroup[self.type]["_tostring_method"],
+ encoding="unicode",
+ with_tail=False,
+ ),
)
except (AttributeError, TypeError):
if tree is True:
@@ -674,10 +882,7 @@ def remove_namespaces(self) -> None:
# loop on element attributes also
for an in el.attrib:
if an.startswith("{"):
- # this cast shouldn't be needed as pop never returns None
- el.attrib[an.split("}", 1)[1]] = typing.cast(
- str, el.attrib.pop(an)
- )
+ el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an)
# remove namespace declarations
etree.cleanup_namespaces(self.root)
@@ -702,7 +907,7 @@ def remove(self) -> None:
)
try:
- parent.remove(self.root) # type: ignore[union-attr]
+ parent.remove(self.root)
except AttributeError:
# 'NoneType' object has no attribute 'remove'
raise CannotRemoveElementWithoutParent(
@@ -710,7 +915,7 @@ def remove(self) -> None:
"are you trying to remove a root element?"
)
- def drop(self):
+ def drop(self) -> None:
"""
Drop matched nodes from the parent element.
"""
@@ -727,9 +932,11 @@ def drop(self):
try:
if self.type == "xml":
+ if parent is None:
+ raise ValueError("This node has no parent")
parent.remove(self.root)
else:
- self.root.drop_tree()
+ typing.cast(html.HtmlElement, self.root).drop_tree()
except (AttributeError, AssertionError):
# 'NoneType' object has no attribute 'drop'
raise CannotDropElementWithoutParent(
@@ -787,7 +994,8 @@ def __bool__(self) -> bool:
__nonzero__ = __bool__
def __str__(self) -> str:
- data = repr(shorten(self.get(), width=40))
- return f"<{type(self).__name__} xpath={self._expr!r} data={data}>"
+ return str(self.get())
- __repr__ = __str__
+ def __repr__(self) -> str:
+ data = repr(shorten(str(self.get()), width=40))
+ return f"<{type(self).__name__} query={self._expr!r} data={data}>"
diff --git a/parsel/utils.py b/parsel/utils.py
index 5e6d92de..ec77d74b 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -1,9 +1,10 @@
import re
-from typing import Any, List, Pattern, Union, cast, Match
+from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast
+
from w3lib.html import replace_entities as w3lib_replace_entities
-def flatten(x):
+def flatten(x: Iterable[Any]) -> List[Any]:
"""flatten(sequence) -> list
Returns a single, flat list which contains all elements retrieved
from the sequence and all recursively contained sub-sequences
@@ -21,7 +22,7 @@ def flatten(x):
return list(iflatten(x))
-def iflatten(x):
+def iflatten(x: Iterable[Any]) -> Iterator[Any]:
"""iflatten(sequence) -> Iterator
Similar to ``.flatten()``, but returns iterator instead"""
for el in x:
diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py
index 9e5c0a96..7633d107 100644
--- a/parsel/xpathfuncs.py
+++ b/parsel/xpathfuncs.py
@@ -1,13 +1,14 @@
import re
-from lxml import etree
+from typing import Any, Callable, Optional
+from lxml import etree
from w3lib.html import HTML5_WHITESPACE
regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub
-def set_xpathfunc(fname, func):
+def set_xpathfunc(fname: str, func: Optional[Callable]) -> None: # type: ignore[type-arg]
"""Register a custom extension function to use in XPath expressions.
The function ``func`` registered under ``fname`` identifier will be called
@@ -28,11 +29,11 @@ def set_xpathfunc(fname, func):
del ns_fns[fname]
-def setup():
+def setup() -> None:
set_xpathfunc("has-class", has_class)
-def has_class(context, *classes):
+def has_class(context: Any, *classes: str) -> bool:
"""has-class function.
Return True if all ``classes`` are present in element's class attr.
@@ -40,14 +41,10 @@ def has_class(context, *classes):
"""
if not context.eval_context.get("args_checked"):
if not classes:
- raise ValueError(
- "XPath error: has-class must have at least 1 argument"
- )
+ raise ValueError("XPath error: has-class must have at least 1 argument")
for c in classes:
if not isinstance(c, str):
- raise ValueError(
- "XPath error: has-class arguments must be strings"
- )
+ raise ValueError("XPath error: has-class arguments must be strings")
context.eval_context["args_checked"] = True
node_cls = context.context_node.get("class")
diff --git a/pylintrc b/pylintrc
index 1892721c..c909c457 100644
--- a/pylintrc
+++ b/pylintrc
@@ -4,7 +4,6 @@ persistent=no
[MESSAGES CONTROL]
disable=c-extension-no-member,
- deprecated-method,
fixme,
import-error,
import-outside-toplevel,
@@ -16,6 +15,7 @@ disable=c-extension-no-member,
no-else-return,
no-member,
parse-error,
+ protected-access,
raise-missing-from,
redefined-builtin,
too-few-public-methods,
@@ -26,4 +26,4 @@ disable=c-extension-no-member,
unused-argument,
use-a-generator,
wrong-import-order,
- wrong-import-position
+ wrong-import-position,
diff --git a/setup.cfg b/setup.cfg
index cf0f47f0..7c964b49 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,2 @@
[wheel]
universal=1
-
-[aliases]
-test=pytest
diff --git a/setup.py b/setup.py
index 90db4dee..e88321f5 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,6 @@
from setuptools import setup
-
with open("README.rst", encoding="utf-8") as readme_file:
readme = readme_file.read()
@@ -11,9 +10,10 @@
setup(
name="parsel",
- version="1.7.0",
+ version="1.9.1",
description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors",
long_description=readme + "\n\n" + history,
+ long_description_content_type="text/x-rst",
author="Scrapy project",
author_email="info@scrapy.org",
url="https://github.com/scrapy/parsel",
@@ -25,13 +25,14 @@
},
include_package_data=True,
install_requires=[
- "cssselect>=0.9",
+ "cssselect>=1.2.0",
+ "jmespath",
"lxml",
"packaging",
"w3lib>=1.19.0",
"html-text>=0.5.2",
],
- python_requires=">=3.7",
+ python_requires=">=3.8",
license="BSD",
zip_safe=False,
keywords="parsel",
@@ -44,17 +45,14 @@
"Topic :: Text Processing :: Markup :: HTML",
"Topic :: Text Processing :: Markup :: XML",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
],
- setup_requires=[
- "pytest-runner",
- ],
tests_require=[
"pytest",
],
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 47ce04a8..8b5e554f 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1,18 +1,20 @@
+import pickle
import re
+import unittest
import warnings
import weakref
-import unittest
-import pickle
-from typing import Any, cast
+from typing import Any, Mapping, Optional, cast
from lxml import etree
from lxml.html import HtmlElement
-from pkg_resources import parse_version
+from packaging.version import Version
from parsel import Selector, SelectorList
from parsel.selector import (
- CannotRemoveElementWithoutRoot,
+ _NOT_SET,
+ LXML_SUPPORTS_HUGE_TREE,
CannotRemoveElementWithoutParent,
+ CannotRemoveElementWithoutRoot,
)
@@ -28,9 +30,7 @@ def assertIsSelectorList(self, value: Any) -> None:
def test_pickle_selector(self) -> None:
sel = self.sscls(text="some text
")
- self.assertRaises(
- TypeError, lambda s: pickle.dumps(s, protocol=2), sel
- )
+ self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel)
def test_pickle_selector_list(self) -> None:
sel = self.sscls(
@@ -40,9 +40,7 @@ def test_pickle_selector_list(self) -> None:
empty_sel_list = sel.css("p")
self.assertIsSelectorList(sel_list)
self.assertIsSelectorList(empty_sel_list)
- self.assertRaises(
- TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list
- )
+ self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list)
self.assertRaises(
TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list
)
@@ -63,7 +61,8 @@ def test_simple_selection(self) -> None:
)
self.assertEqual(
- [x.extract() for x in sel.xpath("//input[@name='a']/@name")], ["a"]
+ [x.extract() for x in sel.xpath("//input[@name='a']/@name")],
+ ["a"],
)
self.assertEqual(
[
@@ -94,10 +93,7 @@ def test_simple_selection_with_variables(self) -> None:
sel = self.sscls(text=body)
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath("//input[@value=$number]/@name", number=1)
- ],
+ [x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)],
["a"],
)
self.assertEqual(
@@ -119,15 +115,11 @@ def test_simple_selection_with_variables(self) -> None:
# you can also pass booleans
self.assertEqual(
- sel.xpath(
- "boolean(count(//input)=$cnt)=$test", cnt=2, test=True
- ).extract(),
+ sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(),
["1"],
)
self.assertEqual(
- sel.xpath(
- "boolean(count(//input)=$cnt)=$test", cnt=4, test=True
- ).extract(),
+ sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(),
["0"],
)
self.assertEqual(
@@ -157,16 +149,11 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None:
t = 'I say "Yeah!"'
# naive string formatting with give something like:
# ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name
- self.assertRaises(
- ValueError, sel.xpath, f'//input[@value="{t}"]/@name'
- )
+ self.assertRaises(ValueError, sel.xpath, f'//input[@value="{t}"]/@name')
# with XPath variables, escaping is done for you
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath("//input[@value=$text]/@name", text=t)
- ],
+ [x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)],
["a"],
)
lt = """I'm mixing single and "double quotes" and I don't care :)"""
@@ -179,9 +166,7 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None:
self.assertEqual(
[
x.extract()
- for x in sel.xpath(
- "//p[normalize-space()=$lng]//@name", lng=lt
- )
+ for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt)
],
["a"],
)
@@ -205,9 +190,7 @@ def test_accessing_attributes(self) -> None:
)
# for a SelectorList, bring the attributes of first-element only
- self.assertEqual(
- {"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib
- )
+ self.assertEqual({"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib)
self.assertEqual(
{"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib
)
@@ -227,9 +210,7 @@ def test_representation_slice(self) -> None:
body = f""
sel = self.sscls(text=body)
- representation = (
- f""
- )
+ representation = f""
self.assertEqual(
[repr(it) for it in sel.xpath("//input/@name")], [representation]
@@ -238,9 +219,7 @@ def test_representation_slice(self) -> None:
def test_representation_unicode_query(self) -> None:
body = f""
- representation = (
- ""
- )
+ representation = ""
sel = self.sscls(text=body)
self.assertEqual(
@@ -299,9 +278,7 @@ def test_selector_get_alias(self) -> None:
self.assertEqual(
sel.xpath("//ul/li[position()>1]")[0].get(), '2'
)
- self.assertEqual(
- sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2"
- )
+ self.assertEqual(sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2")
def test_selector_getall_alias(self) -> None:
"""Test if get() returns extracted value on a Selector"""
@@ -371,9 +348,7 @@ def test_extract_first_re_default(self) -> None:
def test_select_unicode_query(self) -> None:
body = ""
sel = self.sscls(text=body)
- self.assertEqual(
- sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"]
- )
+ self.assertEqual(sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"])
def test_list_elements_type(self) -> None:
"""Test Selector returning the same type in selection methods"""
@@ -390,12 +365,8 @@ def test_list_elements_type(self) -> None:
def test_boolean_result(self) -> None:
body = ""
xs = self.sscls(text=body)
- self.assertEqual(
- xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"]
- )
- self.assertEqual(
- xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"]
- )
+ self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"])
+ self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"])
def test_differences_parsing_xml_vs_html(self) -> None:
"""Test that XML and HTML Selector's behave differently"""
@@ -419,7 +390,7 @@ def test_error_for_unknown_selector_type(self) -> None:
def test_text_or_root_is_required(self) -> None:
self.assertRaisesRegex(
ValueError,
- "Selector needs either text or root argument",
+ "Selector needs text, body, or root arguments",
self.sscls,
)
@@ -525,9 +496,7 @@ def test_mixed_nested_selectors(self) -> None:
self.assertEqual(
sel.xpath('//div[@id="1"]').css("span::text").extract(), ["me"]
)
- self.assertEqual(
- sel.css("#1").xpath("./span/text()").extract(), ["me"]
- )
+ self.assertEqual(sel.css("#1").xpath("./span/text()").extract(), ["me"])
def test_dont_strip(self) -> None:
sel = self.sscls(text='')
@@ -558,7 +527,8 @@ def test_namespaces_adhoc(self) -> None:
self.assertEqual(
x.xpath(
- "//somens:a/text()", namespaces={"somens": "http://scrapy.org"}
+ "//somens:a/text()",
+ namespaces={"somens": "http://scrapy.org"},
).extract(),
["take this"],
)
@@ -601,16 +571,12 @@ def test_namespaces_multiple(self) -> None:
x.register_namespace("b", "http://somens.com")
self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1)
self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], "hello")
- self.assertEqual(
- x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value"
- )
+ self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value")
self.assertEqual(
x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], "90"
)
self.assertEqual(
- x.xpath("//p:SecondTestTag")
- .xpath("./xmlns:price/text()")[0]
- .extract(),
+ x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(),
"90",
)
self.assertEqual(
@@ -652,7 +618,8 @@ def test_namespaces_multiple_adhoc(self) -> None:
# "xmlns" is still defined
self.assertEqual(
x.xpath(
- "//xmlns:TestTag/@b:att", namespaces={"b": "http://somens.com"}
+ "//xmlns:TestTag/@b:att",
+ namespaces={"b": "http://somens.com"},
).extract()[0],
"value",
)
@@ -706,9 +673,7 @@ def test_namespaces_multiple_adhoc(self) -> None:
)
# "p" prefix is not cached from previous calls
- self.assertRaises(
- ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()"
- )
+ self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()")
x.register_namespace("p", "http://www.scrapy.org/product")
self.assertEqual(
@@ -780,9 +745,7 @@ def test_re_replace_entities(self) -> None:
)
self.assertEqual(
- x.xpath("//script/text()").re_first(
- name_re, replace_entities=False
- ),
+ x.xpath("//script/text()").re_first(name_re, replace_entities=False),
expected,
)
self.assertEqual(
@@ -793,15 +756,11 @@ def test_re_replace_entities(self) -> None:
def test_re_intl(self) -> None:
body = "Evento: cumplea\xf1os
"
x = self.sscls(text=body)
- self.assertEqual(
- x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"]
- )
+ self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"])
def test_selector_over_text(self) -> None:
hs = self.sscls(text="lala")
- self.assertEqual(
- hs.extract(), "lala"
- )
+ self.assertEqual(hs.extract(), "lala")
xs = self.sscls(text="lala", type="xml")
self.assertEqual(xs.extract(), "lala")
self.assertEqual(xs.xpath(".").extract(), ["lala"])
@@ -827,17 +786,13 @@ def test_http_header_encoding_precedence(self) -> None:
\xa3"""
x = self.sscls(text=text)
- self.assertEqual(
- x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"]
- )
+ self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"])
def test_empty_bodies_shouldnt_raise_errors(self) -> None:
self.sscls(text="").xpath("//text()").extract()
def test_bodies_with_comments_only(self) -> None:
- sel = self.sscls(
- text="", base_url="http://example.com"
- )
+ sel = self.sscls(text="", base_url="http://example.com")
self.assertEqual("http://example.com", sel.root.base)
def test_null_bytes_shouldnt_raise_errors(self) -> None:
@@ -863,9 +818,7 @@ def test_select_on_unevaluable_nodes(self) -> None:
self.assertEqual(x1.xpath(".//text()").extract(), [])
def test_select_on_text_nodes(self) -> None:
- r = self.sscls(
- text="Options:opt1
Otheropt2
"
- )
+ r = self.sscls(text="Options:opt1
Otheropt2
")
x1 = r.xpath(
"//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]"
)
@@ -879,9 +832,7 @@ def test_select_on_text_nodes(self) -> None:
@unittest.skip("Text nodes lost parent node reference in lxml")
def test_nested_select_on_text_nodes(self) -> None:
# FIXME: does not work with lxml backend [upstream]
- r = self.sscls(
- text="Options:opt1
Otheropt2
"
- )
+ r = self.sscls(text="Options:opt1
Otheropt2
")
x1 = r.xpath("//div/descendant::text()")
x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]")
self.assertEqual(x2.extract(), ["Options:"])
@@ -934,17 +885,14 @@ def test_remove_namespaces_embedded(self) -> None:
self.assertEqual(
len(
sel.xpath(
- "//f:link", namespaces={"f": "http://www.w3.org/2005/Atom"}
+ "//f:link",
+ namespaces={"f": "http://www.w3.org/2005/Atom"},
)
),
2,
)
self.assertEqual(
- len(
- sel.xpath(
- "//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"}
- )
- ),
+ len(sel.xpath("//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"})),
2,
)
sel.remove_namespaces()
@@ -992,19 +940,13 @@ class SmartStringsSelector(Selector):
li_text = x.xpath("//li/text()")
self.assertFalse(any([hasattr(e.root, "getparent") for e in li_text]))
div_class = x.xpath("//div/@class")
- self.assertFalse(
- any([hasattr(e.root, "getparent") for e in div_class])
- )
+ self.assertFalse(any([hasattr(e.root, "getparent") for e in div_class]))
smart_x = SmartStringsSelector(text=body)
smart_li_text = smart_x.xpath("//li/text()")
- self.assertTrue(
- all([hasattr(e.root, "getparent") for e in smart_li_text])
- )
+ self.assertTrue(all([hasattr(e.root, "getparent") for e in smart_li_text]))
smart_div_class = smart_x.xpath("//div/@class")
- self.assertTrue(
- all([hasattr(e.root, "getparent") for e in smart_div_class])
- )
+ self.assertTrue(all([hasattr(e.root, "getparent") for e in smart_div_class]))
def test_xml_entity_expansion(self) -> None:
malicious_xml = (
@@ -1029,7 +971,7 @@ class MySelector(Selector):
selectorlist_cls = MySelectorList
def extra_method(self) -> str:
- return "extra" + self.get()
+ return "extra" + cast(str, self.get())
sel = MySelector(text="foo
")
self.assertIsInstance(sel.xpath("//div"), MySelectorList)
@@ -1123,9 +1065,9 @@ def test_drop_with_xml_type(self) -> None:
el.drop()
assert sel.get() == ""
- def test_deep_nesting(self):
- lxml_version = parse_version(etree.__version__)
- lxml_huge_tree_version = parse_version("4.2")
+ def test_deep_nesting(self) -> None:
+ lxml_version = Version(etree.__version__)
+ lxml_huge_tree_version = Version("4.2")
content = """
@@ -1194,6 +1136,81 @@ def test_deep_nesting(self):
self.assertEqual(len(sel.css("span")), nest_level)
self.assertEqual(len(sel.css("td")), 1)
+ def test_invalid_type(self) -> None:
+ with self.assertRaises(ValueError):
+ self.sscls("", type="xhtml")
+
+ def test_default_type(self) -> None:
+ text = "foo"
+ selector = self.sscls(text)
+ self.assertEqual(selector.type, "html")
+
+ def test_json_type(self) -> None:
+ obj = 1
+ selector = self.sscls(str(obj), type="json")
+ self.assertEqual(selector.root, obj)
+ self.assertEqual(selector.type, "json")
+
+ def test_html_root(self) -> None:
+ root = etree.fromstring("")
+ selector = self.sscls(root=root)
+ self.assertEqual(selector.root, root)
+ self.assertEqual(selector.type, "html")
+
+ def test_json_root(self) -> None:
+ obj = 1
+ selector = self.sscls(root=obj)
+ self.assertEqual(selector.root, obj)
+ self.assertEqual(selector.type, "json")
+
+ def test_json_xpath(self) -> None:
+ obj = 1
+ selector = self.sscls(root=obj)
+ with self.assertRaises(ValueError):
+ selector.xpath("//*")
+
+ def test_json_css(self) -> None:
+ obj = 1
+ selector = self.sscls(root=obj)
+ with self.assertRaises(ValueError):
+ selector.css("*")
+
+ def test_invalid_json(self) -> None:
+ text = ""
+ selector = self.sscls(text, type="json")
+ self.assertEqual(selector.root, None)
+ self.assertEqual(selector.type, "json")
+
+ def test_text_and_root_warning(self) -> None:
+ with warnings.catch_warnings(record=True) as w:
+ Selector(text="a", root="b")
+ self.assertIn("both text and root", str(w[0].message))
+
+ def test_etree_root_invalid_type(self) -> None:
+ selector = Selector("")
+ self.assertRaisesRegex(
+ ValueError,
+ "object as root",
+ Selector,
+ root=selector.root,
+ type="text",
+ )
+ self.assertRaisesRegex(
+ ValueError,
+ "object as root",
+ Selector,
+ root=selector.root,
+ type="json",
+ )
+
+ def test_json_selector_representation(self) -> None:
+ selector = Selector(text="true")
+ assert repr(selector) == ""
+ assert str(selector) == "True"
+ selector = Selector(text="1")
+ assert repr(selector) == ""
+ assert str(selector) == "1"
+
class ExsltTestCase(unittest.TestCase):
@@ -1214,30 +1231,18 @@ def test_regexp(self) -> None:
# re:test()
self.assertEqual(
sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(),
- [
- x.extract()
- for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')
- ],
+ [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')],
)
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()')
- ],
+ [x.extract() for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()')],
["first link", "second link"],
)
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath('//a[re:test(@href, "first")]/text()')
- ],
+ [x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()')],
["first link"],
)
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath('//a[re:test(@href, "second")]/text()')
- ],
+ [x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()')],
["second link"],
)
@@ -1267,9 +1272,7 @@ def test_regexp(self) -> None:
r're:replace(//a[re:test(@href, "\.xml$")]/@href,'
r'"(\w+)://(.+)(\.xml)", "","https://\2.html")'
).extract(),
- [
- "https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html"
- ],
+ ["https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html"],
)
def test_set(self) -> None:
@@ -1359,9 +1362,9 @@ def test_text_get(self):
self.assertEqual(txt, "title:\n\nsome text")
def test_text_getall(self):
- sel = self.sscls(
- text=""
- ).getall(text=True)
+ sel = self.sscls(text="").getall(
+ text=True
+ )
self.assertEqual(1, len(sel))
self.assertEqual("option1\noption2", sel[0])
@@ -1399,3 +1402,57 @@ def test_text_xpath_get(self):
html.xpath('//div[@class="product"]/span').getall(text=True),
["Price: 100", "Price: 200"],
)
+
+
+class SelectorBytesInput(Selector):
+ def __init__(
+ self,
+ text: Optional[str] = None,
+ type: Optional[str] = None,
+ body: bytes = b"",
+ encoding: str = "utf8",
+ namespaces: Optional[Mapping[str, str]] = None,
+ root: Optional[Any] = _NOT_SET,
+ base_url: Optional[str] = None,
+ _expr: Optional[str] = None,
+ huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+ ) -> None:
+ if text:
+ body = bytes(text, encoding=encoding)
+ text = None
+ super().__init__(
+ text=text,
+ type=type,
+ body=body,
+ encoding=encoding,
+ namespaces=namespaces,
+ root=root,
+ base_url=base_url,
+ _expr=_expr,
+ huge_tree=huge_tree,
+ )
+
+
+class SelectorTestCaseBytes(SelectorTestCase):
+ sscls = SelectorBytesInput
+
+ def test_representation_slice(self) -> None:
+ pass
+
+ def test_representation_unicode_query(self) -> None:
+ pass
+
+ def test_weakref_slots(self) -> None:
+ pass
+
+ def test_check_text_argument_type(self) -> None:
+ self.assertRaisesRegex(
+ TypeError,
+ "body argument should be of type",
+ self.sscls,
+ body="",
+ )
+
+
+class ExsltTestCaseBytes(ExsltTestCase):
+ sscls = SelectorBytesInput
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py
index e2934d17..2adc2f5e 100644
--- a/tests/test_selector_csstranslator.py
+++ b/tests/test_selector_csstranslator.py
@@ -1,17 +1,18 @@
"""
Selector tests for cssselect backend
"""
+
import unittest
+from typing import Any, Callable, List, Protocol, Tuple, Type, Union
import cssselect
import pytest
-from packaging.version import Version
-
-from parsel.csstranslator import GenericTranslator, HTMLTranslator
-from parsel import Selector
from cssselect.parser import SelectorSyntaxError
from cssselect.xpath import ExpressionError
+from packaging.version import Version
+from parsel import Selector
+from parsel.csstranslator import GenericTranslator, HTMLTranslator, TranslatorProtocol
HTMLBODY = """
@@ -49,12 +50,32 @@
"""
+class TranslatorTestProtocol(Protocol):
+ tr_cls: Type[TranslatorProtocol]
+ tr: TranslatorProtocol
+
+ def c2x(self, css: str, prefix: str = ...) -> str:
+ pass
+
+ def assertEqual(self, first: Any, second: Any, msg: Any = ...) -> None:
+ pass
+
+ def assertRaises(
+ self,
+ expected_exception: Union[Type[BaseException], Tuple[Type[BaseException], ...]],
+ callable: Callable[..., object],
+ *args: Any,
+ **kwargs: Any,
+ ) -> None:
+ pass
+
+
class TranslatorTestMixin:
- def setUp(self):
+ def setUp(self: TranslatorTestProtocol) -> None:
self.tr = self.tr_cls()
self.c2x = self.tr.css_to_xpath
- def test_attr_function(self):
+ def test_attr_function(self: TranslatorTestProtocol) -> None:
cases = [
("::attr(name)", "descendant-or-self::*/@name"),
("a::attr(href)", "descendant-or-self::a/@href"),
@@ -67,7 +88,7 @@ def test_attr_function(self):
for css, xpath in cases:
self.assertEqual(self.c2x(css), xpath, css)
- def test_attr_function_exception(self):
+ def test_attr_function_exception(self: TranslatorTestProtocol) -> None:
cases = [
("::attr(12)", ExpressionError),
("::attr(34test)", ExpressionError),
@@ -76,7 +97,7 @@ def test_attr_function_exception(self):
for css, exc in cases:
self.assertRaises(exc, self.c2x, css)
- def test_text_pseudo_element(self):
+ def test_text_pseudo_element(self: TranslatorTestProtocol) -> None:
cases = [
("::text", "descendant-or-self::text()"),
("p::text", "descendant-or-self::p/text()"),
@@ -105,7 +126,7 @@ def test_text_pseudo_element(self):
for css, xpath in cases:
self.assertEqual(self.c2x(css), xpath, css)
- def test_pseudo_function_exception(self):
+ def test_pseudo_function_exception(self: TranslatorTestProtocol) -> None:
cases = [
("::attribute(12)", ExpressionError),
("::text()", ExpressionError),
@@ -114,14 +135,14 @@ def test_pseudo_function_exception(self):
for css, exc in cases:
self.assertRaises(exc, self.c2x, css)
- def test_unknown_pseudo_element(self):
+ def test_unknown_pseudo_element(self: TranslatorTestProtocol) -> None:
cases = [
("::text-node", ExpressionError),
]
for css, exc in cases:
self.assertRaises(exc, self.c2x, css)
- def test_unknown_pseudo_class(self):
+ def test_unknown_pseudo_class(self: TranslatorTestProtocol) -> None:
cases = [
(":text", ExpressionError),
(":attribute(name)", ExpressionError),
@@ -139,7 +160,7 @@ class GenericTranslatorTest(TranslatorTestMixin, unittest.TestCase):
class UtilCss2XPathTest(unittest.TestCase):
- def test_css2xpath(self):
+ def test_css2xpath(self) -> None:
from parsel import css2xpath
expected_xpath = (
@@ -153,15 +174,13 @@ class CSSSelectorTest(unittest.TestCase):
sscls = Selector
- def setUp(self):
+ def setUp(self) -> None:
self.sel = self.sscls(text=HTMLBODY)
- def x(self, *a, **kw):
- return [
- v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()
- ]
+ def x(self, *a: Any, **kw: Any) -> List[str]:
+ return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()]
- def test_selector_simple(self):
+ def test_selector_simple(self) -> None:
for x in self.sel.css("input"):
self.assertTrue(isinstance(x, self.sel.__class__), x)
self.assertEqual(
@@ -169,7 +188,7 @@ def test_selector_simple(self):
[x.extract() for x in self.sel.css("input")],
)
- def test_text_pseudo_element(self):
+ def test_text_pseudo_element(self) -> None:
self.assertEqual(self.x("#p-b2"), ['guy'])
self.assertEqual(self.x("#p-b2::text"), ["guy"])
self.assertEqual(self.x("#p-b2 ::text"), ["guy"])
@@ -179,11 +198,9 @@ def test_text_pseudo_element(self):
["lorem ipsum text", "hi", "there", "guy"],
)
self.assertEqual(self.x("p::text"), ["lorem ipsum text"])
- self.assertEqual(
- self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"]
- )
+ self.assertEqual(self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"])
- def test_attribute_function(self):
+ def test_attribute_function(self) -> None:
self.assertEqual(self.x("#p-b2::attr(id)"), ["p-b2"])
self.assertEqual(self.x(".cool-footer::attr(class)"), ["cool-footer"])
self.assertEqual(
@@ -193,10 +210,8 @@ def test_attribute_function(self):
self.x('map[name="dummymap"] ::attr(shape)'), ["circle", "default"]
)
- def test_nested_selector(self):
- self.assertEqual(
- self.sel.css("p").css("b::text").extract(), ["hi", "guy"]
- )
+ def test_nested_selector(self) -> None:
+ self.assertEqual(self.sel.css("p").css("b::text").extract(), ["hi", "guy"])
self.assertEqual(
self.sel.css("div").css("area:last-child").extract(),
[''],
@@ -206,5 +221,10 @@ def test_nested_selector(self):
Version(cssselect.__version__) < Version("1.2.0"),
reason="Support added in cssselect 1.2.0",
)
- def test_pseudoclass_has(self):
+ def test_pseudoclass_has(self) -> None:
self.assertEqual(self.x("p:has(b)::text"), ["lorem ipsum text"])
+
+
+class CSSSelectorTestBytes(CSSSelectorTest):
+ def setUp(self) -> None:
+ self.sel = self.sscls(body=bytes(HTMLBODY, encoding="utf8"))
diff --git a/tests/test_selector_jmespath.py b/tests/test_selector_jmespath.py
new file mode 100644
index 00000000..5afbd6d0
--- /dev/null
+++ b/tests/test_selector_jmespath.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+
+from parsel import Selector
+from parsel.selector import _NOT_SET
+
+
+class JMESPathTestCase(unittest.TestCase):
+ def test_json_has_html(self) -> None:
+ """Sometimes the information is returned in a json wrapper"""
+ data = """
+ {
+ "content": [
+ {
+ "name": "A",
+ "value": "a"
+ },
+ {
+ "name": {
+ "age": 18
+ },
+ "value": "b"
+ },
+ {
+ "name": "C",
+ "value": "c"
+ },
+ {
+ "name": "D",
+ "value": "d
"
+ }
+ ],
+ "html": ""
+ }
+ """
+ sel = Selector(text=data)
+ self.assertEqual(
+ sel.jmespath("html").get(),
+ "",
+ )
+ self.assertEqual(
+ sel.jmespath("html").xpath("//div/a/text()").getall(),
+ ["a", "b", "d"],
+ )
+ self.assertEqual(sel.jmespath("html").css("div > b").getall(), ["f"])
+ self.assertEqual(sel.jmespath("content").jmespath("name.age").get(), 18)
+
+ def test_html_has_json(self) -> None:
+ html_text = """
+
+
Information
+
+ {
+ "user": [
+ {
+ "name": "A",
+ "age": 18
+ },
+ {
+ "name": "B",
+ "age": 32
+ },
+ {
+ "name": "C",
+ "age": 22
+ },
+ {
+ "name": "D",
+ "age": 25
+ }
+ ],
+ "total": 4,
+ "status": "ok"
+ }
+
+
+ """
+ sel = Selector(text=html_text)
+ self.assertEqual(
+ sel.xpath("//div/content/text()").jmespath("user[*].name").getall(),
+ ["A", "B", "C", "D"],
+ )
+ self.assertEqual(
+ sel.xpath("//div/content").jmespath("user[*].name").getall(),
+ ["A", "B", "C", "D"],
+ )
+ self.assertEqual(sel.xpath("//div/content").jmespath("total").get(), 4)
+
+ def test_jmestpath_with_re(self) -> None:
+ html_text = """
+
+
Information
+
+ {
+ "user": [
+ {
+ "name": "A",
+ "age": 18
+ },
+ {
+ "name": "B",
+ "age": 32
+ },
+ {
+ "name": "C",
+ "age": 22
+ },
+ {
+ "name": "D",
+ "age": 25
+ }
+ ],
+ "total": 4,
+ "status": "ok"
+ }
+
+
+ """
+ sel = Selector(text=html_text)
+ self.assertEqual(
+ sel.xpath("//div/content/text()").jmespath("user[*].name").re(r"(\w+)"),
+ ["A", "B", "C", "D"],
+ )
+ self.assertEqual(
+ sel.xpath("//div/content").jmespath("user[*].name").re(r"(\w+)"),
+ ["A", "B", "C", "D"],
+ )
+
+ with self.assertRaises(TypeError):
+ sel.xpath("//div/content").jmespath("user[*].age").re(r"(\d+)")
+
+ self.assertEqual(
+ sel.xpath("//div/content").jmespath("unavailable").re(r"(\d+)"), []
+ )
+
+ self.assertEqual(
+ sel.xpath("//div/content").jmespath("unavailable").re_first(r"(\d+)"),
+ None,
+ )
+
+ self.assertEqual(
+ sel.xpath("//div/content")
+ .jmespath("user[*].age.to_string(@)")
+ .re(r"(\d+)"),
+ ["18", "32", "22", "25"],
+ )
+
+ def test_json_types(self) -> None:
+ for text, root in (
+ ("{}", {}),
+ ('{"a": "b"}', {"a": "b"}),
+ ("[]", []),
+ ('["a"]', ["a"]),
+ ('""', ""),
+ ("0", 0),
+ ("1", 1),
+ ("true", True),
+ ("false", False),
+ ("null", None),
+ ):
+ selector = Selector(text=text, root=_NOT_SET)
+ self.assertEqual(selector.type, "json")
+ self.assertEqual(selector._text, text) # pylint: disable=protected-access
+ self.assertEqual(selector.root, root)
+
+ selector = Selector(text=None, root=root)
+ self.assertEqual(selector.type, "json")
+ self.assertEqual(selector._text, None) # pylint: disable=protected-access
+ self.assertEqual(selector.root, root)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e2bca559..ee3e1121 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,9 @@
-from parsel.utils import shorten, extract_regex
+from typing import List, Pattern, Type, Union
from pytest import mark, raises
+from parsel.utils import extract_regex, shorten
+
@mark.parametrize(
"width,expected",
@@ -17,7 +19,7 @@
(7, "foobar"),
),
)
-def test_shorten(width, expected):
+def test_shorten(width: int, expected: Union[str, Type[Exception]]) -> None:
if isinstance(expected, str):
assert shorten("foobar", width) == expected
else:
@@ -66,5 +68,10 @@ def test_shorten(width, expected):
],
),
)
-def test_extract_regex(regex, text, replace_entities, expected):
+def test_extract_regex(
+ regex: Union[str, Pattern[str]],
+ text: str,
+ replace_entities: bool,
+ expected: List[str],
+) -> None:
assert extract_regex(regex, text, replace_entities) == expected
diff --git a/tests/test_xml_attacks.py b/tests/test_xml_attacks.py
index 45b0243a..e7b5a486 100644
--- a/tests/test_xml_attacks.py
+++ b/tests/test_xml_attacks.py
@@ -7,11 +7,10 @@
from parsel import Selector
-
MiB_1 = 1024**2
-def _load(attack):
+def _load(attack: str) -> str:
folder_path = path.dirname(__file__)
file_path = path.join(folder_path, "xml_attacks", f"{attack}.xml")
with open(file_path, "rb") as attack_file:
@@ -21,7 +20,7 @@ def _load(attack):
# List of known attacks:
# https://github.com/tiran/defusedxml#python-xml-libraries
class XMLAttackTestCase(TestCase):
- def test_billion_laughs(self):
+ def test_billion_laughs(self) -> None:
process = Process()
memory_usage_before = process.memory_info().rss
selector = Selector(text=_load("billion_laughs"))
diff --git a/tests/test_xpathfuncs.py b/tests/test_xpathfuncs.py
index 744472a9..7739982d 100644
--- a/tests/test_xpathfuncs.py
+++ b/tests/test_xpathfuncs.py
@@ -1,10 +1,12 @@
+import unittest
+from typing import Any
+
from parsel import Selector
from parsel.xpathfuncs import set_xpathfunc
-import unittest
class XPathFuncsTestCase(unittest.TestCase):
- def test_has_class_simple(self):
+ def test_has_class_simple(self) -> None:
body = """
First
Second
@@ -21,21 +23,15 @@ def test_has_class_simple(self):
["Third"],
)
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath('//p[has-class("foo","bar")]/text()')
- ],
+ [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
[],
)
self.assertEqual(
- [
- x.extract()
- for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')
- ],
+ [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
["First"],
)
- def test_has_class_error_no_args(self):
+ def test_has_class_error_no_args(self) -> None:
body = """
First
"""
@@ -47,7 +43,7 @@ def test_has_class_error_no_args(self):
"has-class()",
)
- def test_has_class_error_invalid_arg_type(self):
+ def test_has_class_error_invalid_arg_type(self) -> None:
body = """
First
"""
@@ -59,7 +55,7 @@ def test_has_class_error_invalid_arg_type(self):
"has-class(.)",
)
- def test_has_class_error_invalid_unicode(self):
+ def test_has_class_error_invalid_unicode(self) -> None:
body = """
First
"""
@@ -71,7 +67,7 @@ def test_has_class_error_invalid_unicode(self):
'has-class("héllö")'.encode(),
)
- def test_has_class_unicode(self):
+ def test_has_class_unicode(self) -> None:
body = """
First
"""
@@ -81,7 +77,7 @@ def test_has_class_unicode(self):
["First"],
)
- def test_has_class_uppercase(self):
+ def test_has_class_uppercase(self) -> None:
body = """
First
"""
@@ -91,7 +87,7 @@ def test_has_class_uppercase(self):
["First"],
)
- def test_has_class_newline(self):
+ def test_has_class_newline(self) -> None:
body = """
First
@@ -102,7 +98,7 @@ def test_has_class_newline(self):
["First"],
)
- def test_has_class_tab(self):
+ def test_has_class_tab(self) -> None:
body = """
First
"""
@@ -112,11 +108,11 @@ def test_has_class_tab(self):
["First"],
)
- def test_set_xpathfunc(self):
- def myfunc(ctx):
- myfunc.call_count += 1
+ def test_set_xpathfunc(self) -> None:
+ def myfunc(ctx: Any) -> None:
+ myfunc.call_count += 1 # type: ignore[attr-defined]
- myfunc.call_count = 0
+ myfunc.call_count = 0 # type: ignore[attr-defined]
body = """
First
@@ -131,7 +127,7 @@ def myfunc(ctx):
set_xpathfunc("myfunc", myfunc)
sel.xpath("myfunc()")
- self.assertEqual(myfunc.call_count, 1)
+ self.assertEqual(myfunc.call_count, 1) # type: ignore[attr-defined]
set_xpathfunc("myfunc", None)
self.assertRaisesRegex(
diff --git a/tests/typing/selector.py b/tests/typing/selector.py
index 72310634..b6f14345 100644
--- a/tests/typing/selector.py
+++ b/tests/typing/selector.py
@@ -1,6 +1,7 @@
# Basic usage of the Selector, strongly typed to test the typing of parsel's API.
import re
from typing import List
+
from parsel import Selector
diff --git a/tox.ini b/tox.ini
index e5e5f47e..893b8de7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = security,flake8,typing,pylint,black,docs,py37,py38,py39,py310,pypy3.9
+envlist = typing,pylint,docs,twinecheck,pre-commit,py38,py39,py310,py311,py312,pypy3.9,pypy3.10
[testenv]
usedevelop = True
@@ -7,42 +7,25 @@ deps =
-r{toxinidir}/tests/requirements.txt
commands = py.test --cov=parsel --cov-report=xml {posargs:docs parsel tests}
-[testenv:security]
-deps =
- bandit
-commands =
- bandit -r -c .bandit.yml {posargs:parsel}
-
-[testenv:flake8]
-deps =
- {[testenv]deps}
- flake8==5.0.4
-commands =
- flake8 {posargs: parsel tests setup.py}
-
[testenv:typing]
deps =
{[testenv]deps}
- types-lxml==2022.4.10
- types-psutil==5.9.5.4
- types-setuptools==65.5.0.1
- mypy==0.982
+ types-jmespath==1.0.2.20240106
+ types-lxml==2024.2.9
+ types-psutil==5.9.5.20240311
+ types-setuptools==69.1.0.20240310
+ py==1.11.0
+ mypy==1.9.0
commands =
- mypy {posargs:parsel tests} --warn-unused-ignores --show-error-codes
+ mypy {posargs:parsel tests} --strict
[testenv:pylint]
deps =
{[testenv]deps}
- pylint==2.15.4
+ pylint==3.1.0
commands =
pylint docs parsel tests setup.py
-[testenv:black]
-deps =
- black==22.10.0
-commands =
- black --line-length=79 --check {posargs:parsel tests setup.py}
-
[docs]
changedir = docs
deps = -rdocs/requirements.txt
@@ -55,4 +38,18 @@ deps = {[docs]deps}
commands =
sphinx-build -W -b html . {envtmpdir}/html
sphinx-build -b latex . {envtmpdir}/latex
- sphinx-build -W -b epub . {envtmpdir}/epub
+ sphinx-build -b epub . {envtmpdir}/epub
+
+[testenv:twinecheck]
+basepython = python3
+deps =
+ twine==5.0.0
+ build==1.1.1
+commands =
+ python -m build --sdist
+ twine check dist/*
+
+[testenv:pre-commit]
+deps = pre-commit
+commands = pre-commit run --all-files --show-diff-on-failure
+skip_install = true