Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for regex flags in .re() and .re_first() methods #225

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,15 @@ Use it to extract just the first matching string::
>>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
'My image 1 '

You can also use compiled regular expressions with both methods::

>>> import re
>>> regex = re.compile(r'Name:\s*(.*)')
>>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(regex)
'My image 1 '

As well as adding regex flags with the ``flags`` argument.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this needs an example and/or a link to the Python doc about the flags.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also be OK with just appending (see :mod:`re`) to the first sentence of this entire section, right after the first mention of “regular expressions”, since at that point already users may wonder about regular expressions.


.. _topics-selectors-relative-xpaths:

Working with relative XPaths
Expand Down
43 changes: 36 additions & 7 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@ def css(self, query: str) -> "SelectorList[_SelectorType]":
return self.__class__(flatten([x.css(query) for x in self]))

def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
self,
regex: Union[str, Pattern[str]],
replace_entities: bool = True,
flags: int = 0,
) -> List[str]:
"""
Call the ``.re()`` method for each element in this list and return
Expand All @@ -129,15 +132,22 @@ def re(
corresponding character (except for ``&`` and ``<``.
Passing ``replace_entities`` as ``False`` switches off these
replacements.

It is possible to provide regex flags using the `flags` argument. They
will be applied only if the provided regex is not a compiled regular
expression.
"""
return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
return flatten(
[x.re(regex, replace_entities=replace_entities, flags=flags) for x in self]
)

@typing.overload
def re_first(
self,
regex: Union[str, Pattern[str]],
default: None = None,
replace_entities: bool = True,
flags: int = 0,
) -> Optional[str]:
pass

Expand All @@ -147,6 +157,7 @@ def re_first(
regex: Union[str, Pattern[str]],
default: str,
replace_entities: bool = True,
flags: int = 0,
) -> str:
pass

Expand All @@ -155,6 +166,7 @@ def re_first(
regex: Union[str, Pattern[str]],
default: Optional[str] = None,
replace_entities: bool = True,
flags: int = 0,
) -> Optional[str]:
"""
Call the ``.re()`` method for the first element in this list and
Expand All @@ -168,7 +180,7 @@ def re_first(
replacements.
"""
for el in iflatten(
x.re(regex, replace_entities=replace_entities) for x in self
x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
):
return el
return default
Expand Down Expand Up @@ -358,28 +370,38 @@ def _css2xpath(self, query: str) -> Any:
return self._csstranslator.css_to_xpath(query)

def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
self,
regex: Union[str, Pattern[str]],
replace_entities: bool = True,
flags: int = 0,
) -> List[str]:
"""
Apply the given regex and return a list of unicode strings with the
matches.

``regex`` can be either a compiled regular expression or a string which
will be compiled to a regular expression using ``re.compile(regex)``.
will be compiled to a regular expression using ``re.compile()``.
Comment on lines 382 to +383
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming we don’t actually compile regular expressions, which could be counter-productive performance-wise given Python’s regular expression caching, what about some rewording instead?

        ``regex`` is a regular expression (see :mod:`re`), either as a string 
        or compiled.


By default, character entity references are replaced by their
corresponding character (except for ``&`` and ``<``).
Passing ``replace_entities`` as ``False`` switches off these
replacements.

It is possible to provide regex flags using the `flags` argument. They
will be applied only if the provided regex is not a compiled regular
expression.
"""
return extract_regex(regex, self.get(), replace_entities=replace_entities)
return extract_regex(
regex, self.get(), replace_entities=replace_entities, flags=flags
)

@typing.overload
def re_first(
self,
regex: Union[str, Pattern[str]],
default: None = None,
replace_entities: bool = True,
flags: int = 0,
) -> Optional[str]:
pass

Expand All @@ -389,6 +411,7 @@ def re_first(
regex: Union[str, Pattern[str]],
default: str,
replace_entities: bool = True,
flags: int = 0,
) -> str:
pass

Expand All @@ -397,6 +420,7 @@ def re_first(
regex: Union[str, Pattern[str]],
default: Optional[str] = None,
replace_entities: bool = True,
flags: int = 0,
) -> Optional[str]:
"""
Apply the given regex and return the first unicode string which
Expand All @@ -407,9 +431,14 @@ def re_first(
corresponding character (except for ``&`` and ``<``).
Passing ``replace_entities`` as ``False`` switches off these
replacements.

It is possible to provide regex flags using the `flags` argument. They
will be applied only if the provided regex is not a compiled regular
expression.
"""
return next(
iflatten(self.re(regex, replace_entities=replace_entities)), default
iflatten(self.re(regex, replace_entities=replace_entities, flags=flags)),
default,
)

def get(self) -> str:
Expand Down
9 changes: 7 additions & 2 deletions parsel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,20 @@ def _is_listlike(x: Any) -> bool:


def extract_regex(
regex: Union[str, Pattern[str]], text: str, replace_entities: bool = True
regex: Union[str, Pattern[str]],
text: str,
replace_entities: bool = True,
flags: int = 0,
) -> List[str]:
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex is a string it will be compiled using the provided flags
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
if isinstance(regex, str):
regex = re.compile(regex, re.UNICODE)
flags |= re.UNICODE
regex = re.compile(regex, flags)
Copy link
Member Author

@noviluni noviluni Aug 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I keep the re.UNICODE to respect the old behavior, especially because in some docstrings you can find:

Apply the given regex and return the first unicode string which matches

However, this also means that it won't be possible to override this, and the re.UNICODE will be always applied. I don't think this is a big issue and can be changed in the future, for example when deprecating Python 2.7.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've seen that Python 2.7 has been already deprecated...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Unicode flag also applies to Python 3, though, is not about the Python 2 strings but about pattern behavior.

On the other hand, I see that we actually compile strings into patterns here. It’s out of the scope of this change, so feel free to ignore, but I wonder if, instead of compiling the expressions, we could pass them (and flags) to the corresponding functions. That may even allow flags to work when passed along a compiled regular expression.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Unicode flag also applies to Python 3, though, is not about the Python 2 strings but about pattern behavior.

Sorry, I think I wasn't clear enough and mixed things. What I meant was that we could do something like:

regex = re.compile(regex, flags or re.UNICODE)

or doing what I did (always applying re.UNICODE). The first approach allows us to override the re.UNICODE, but if we were applying re.UNICODE always, it could be confusing to don't apply it when using other flags. The reference I made to Python 2.7 was that we could change this in the future but it would be a breaking change, however, it could come along with the Python 2.7 deprecation that would require a new major version. I didn't know it had been already deprecated before; after I saw that, I considered that maybe we need to think better about this decision.

On the other hand, I see that we actually compile strings into patterns here. It’s out of the scope of this change, so feel free to ignore, but I wonder if, instead of compiling the expressions, we could pass them (and flags) to the corresponding functions. That may even allow flags to work when passed along a compiled regular expression.

It seems a good idea, but if it's not required I would love to keep this PR as-is (shorter). We can open a new issue if you want :)


if "extract" in regex.groupindex:
# named group
Expand Down
49 changes: 47 additions & 2 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def test_re_first(self) -> None:
self.assertEqual(sel.re_first(r"foo"), None)
self.assertEqual(sel.re_first(r"foo", default="bar"), "bar")

def test_extract_first_re_default(self) -> None:
def test_re_first_default(self) -> None:
"""Test if re_first() returns default value when no results found"""
body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
sel = self.sscls(text=body)
Expand All @@ -330,6 +330,30 @@ def test_extract_first_re_default(self) -> None:
sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), "missing"
)

def test_re_first_flags(self) -> None:
body = """
<script>
function example() {
"name": "Adrian",
"points": 3,
}
</script>
"""
sel = self.sscls(text=body)

self.assertEqual(
sel.xpath("//script/text()").re_first(r"example\(\) ({.*})"), None
)
self.assertEqual(
sel.xpath("//script/text()").re_first(
r"example\(\) ({.*})", flags=re.DOTALL
),
"""{
"name": "Adrian",
"points": 3,
}""",
)

def test_select_unicode_query(self) -> None:
body = "<p><input name='\xa9' value='1'/></p>"
sel = self.sscls(text=body)
Expand Down Expand Up @@ -710,7 +734,6 @@ def test_re_replace_entities(self) -> None:
self.assertEqual(
x.xpath("//script")[0].re(name_re, replace_entities=False), [expected]
)

self.assertEqual(
x.xpath("//script/text()").re_first(name_re, replace_entities=False),
expected,
Expand All @@ -724,6 +747,28 @@ def test_re_intl(self) -> None:
x = self.sscls(text=body)
self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"])

def test_re_flags(self) -> None:
body = """
<script>
function example() {
"name": "Adrian",
"points": 3,
}
</script>
"""
sel = self.sscls(text=body)

self.assertEqual(sel.xpath("//script/text()").re(r"example\(\) ({.*})"), [])
self.assertEqual(
sel.xpath("//script/text()").re(r"example\(\) ({.*})", flags=re.DOTALL),
[
"""{
"name": "Adrian",
"points": 3,
}"""
],
)

noviluni marked this conversation as resolved.
Show resolved Hide resolved
def test_selector_over_text(self) -> None:
hs = self.sscls(text="<root>lala</root>")
self.assertEqual(hs.extract(), "<html><body><root>lala</root></body></html>")
Expand Down