scrapy · noviluni · Aug 7, 2021 · Aug 7, 2021 · wRAR · Aug 9, 2021
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -359,6 +359,15 @@ Use it to extract just the first matching string::
     >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
     'My image 1 '
 
+You can also use compiled regular expressions with both methods::
+
+    >>> import re
+    >>> regex = re.compile(r'Name:\s*(.*)')
+    >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(regex)
+    'My image 1 '
+
+Both methods also accept regex flags (for string patterns) through the ``flags`` argument.
+
 .. _topics-selectors-relative-xpaths:
 
 Working with relative XPaths

diff --git a/parsel/selector.py b/parsel/selector.py
@@ -119,7 +119,10 @@ def css(self, query: str) -> "SelectorList[_SelectorType]":
         return self.__class__(flatten([x.css(query) for x in self]))
 
     def re(
-        self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+        self,
+        regex: Union[str, Pattern[str]],
+        replace_entities: bool = True,
+        flags: int = 0,
     ) -> List[str]:
         """
         Call the ``.re()`` method for each element in this list and return
@@ -129,15 +132,22 @@ def re(
         corresponding character (except for ``&amp;`` and ``&lt;``.
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
+
+        Regex flags can be passed through the ``flags`` argument. They are
+        applied only when ``regex`` is a string; a compiled regular expression
+        is used as given.
         """
-        return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
+        return flatten(
+            [x.re(regex, replace_entities=replace_entities, flags=flags) for x in self]
+        )
 
     @typing.overload
     def re_first(
         self,
         regex: Union[str, Pattern[str]],
         default: None = None,
         replace_entities: bool = True,
+        flags: int = 0,
     ) -> Optional[str]:
         pass
 
@@ -147,6 +157,7 @@ def re_first(
         regex: Union[str, Pattern[str]],
         default: str,
         replace_entities: bool = True,
+        flags: int = 0,
     ) -> str:
         pass
 
@@ -155,6 +166,7 @@ def re_first(
         regex: Union[str, Pattern[str]],
         default: Optional[str] = None,
         replace_entities: bool = True,
+        flags: int = 0,
     ) -> Optional[str]:
         """
         Call the ``.re()`` method for the first element in this list and
@@ -168,7 +180,7 @@ def re_first(
         replacements.
         """
         for el in iflatten(
-            x.re(regex, replace_entities=replace_entities) for x in self
+            x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
         ):
             return el
         return default
@@ -358,28 +370,38 @@ def _css2xpath(self, query: str) -> Any:
         return self._csstranslator.css_to_xpath(query)
 
     def re(
-        self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+        self,
+        regex: Union[str, Pattern[str]],
+        replace_entities: bool = True,
+        flags: int = 0,
     ) -> List[str]:
         """
         Apply the given regex and return a list of unicode strings with the
         matches.
 
         ``regex`` can be either a compiled regular expression or a string which
-        will be compiled to a regular expression using ``re.compile(regex)``.
+        will be compiled to a regular expression using ``re.compile()``.
 
         By default, character entity references are replaced by their
         corresponding character (except for ``&amp;`` and ``&lt;``).
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
+
+        Regex flags can be passed through the ``flags`` argument. They are
+        applied only when ``regex`` is a string; a compiled regular expression
+        is used as given.
         """
-        return extract_regex(regex, self.get(), replace_entities=replace_entities)
+        return extract_regex(
+            regex, self.get(), replace_entities=replace_entities, flags=flags
+        )
 
     @typing.overload
     def re_first(
         self,
         regex: Union[str, Pattern[str]],
         default: None = None,
         replace_entities: bool = True,
+        flags: int = 0,
     ) -> Optional[str]:
         pass
 
@@ -389,6 +411,7 @@ def re_first(
         regex: Union[str, Pattern[str]],
         default: str,
         replace_entities: bool = True,
+        flags: int = 0,
     ) -> str:
         pass
 
@@ -397,6 +420,7 @@ def re_first(
         regex: Union[str, Pattern[str]],
         default: Optional[str] = None,
         replace_entities: bool = True,
+        flags: int = 0,
     ) -> Optional[str]:
         """
         Apply the given regex and return the first unicode string which
@@ -407,9 +431,14 @@ def re_first(
         corresponding character (except for ``&amp;`` and ``&lt;``).
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
+
+        Regex flags can be passed through the ``flags`` argument. They are
+        applied only when ``regex`` is a string; a compiled regular expression
+        is used as given.
         """
         return next(
-            iflatten(self.re(regex, replace_entities=replace_entities)), default
+            iflatten(self.re(regex, replace_entities=replace_entities, flags=flags)),
+            default,
         )
 
     def get(self) -> str:

diff --git a/parsel/utils.py b/parsel/utils.py
@@ -57,15 +57,20 @@ def _is_listlike(x: Any) -> bool:
 
 
 def extract_regex(
-    regex: Union[str, Pattern[str]], text: str, replace_entities: bool = True
+    regex: Union[str, Pattern[str]],
+    text: str,
+    replace_entities: bool = True,
+    flags: int = 0,
 ) -> List[str]:
     """Extract a list of unicode strings from the given text/encoding using the following policies:
+    * if the regex is a string it will be compiled using the provided flags combined with ``re.UNICODE``
     * if the regex contains a named group called "extract" that will be returned
     * if the regex contains multiple numbered groups, all those will be returned (flattened)
     * if the regex doesn't contain any group the entire regex matching is returned
     """
     if isinstance(regex, str):
-        regex = re.compile(regex, re.UNICODE)
+        flags |= re.UNICODE
+        regex = re.compile(regex, flags)
 
     if "extract" in regex.groupindex:
         # named group

diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -318,7 +318,7 @@ def test_re_first(self) -> None:
         self.assertEqual(sel.re_first(r"foo"), None)
         self.assertEqual(sel.re_first(r"foo", default="bar"), "bar")
 
-    def test_extract_first_re_default(self) -> None:
+    def test_re_first_default(self) -> None:
         """Test if re_first() returns default value when no results found"""
         body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
         sel = self.sscls(text=body)
@@ -330,6 +330,30 @@ def test_extract_first_re_default(self) -> None:
             sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), "missing"
         )
 
+    def test_re_first_flags(self) -> None:
+        body = """
+        <script>
+            function example() {
+            "name": "Adrian",
+            "points": 3,
+            }
+        </script>
+        """
+        sel = self.sscls(text=body)
+
+        self.assertEqual(
+            sel.xpath("//script/text()").re_first(r"example\(\) ({.*})"), None
+        )
+        self.assertEqual(
+            sel.xpath("//script/text()").re_first(
+                r"example\(\) ({.*})", flags=re.DOTALL
+            ),
+            """{
+            "name": "Adrian",
+            "points": 3,
+            }""",
+        )
+
     def test_select_unicode_query(self) -> None:
         body = "<p><input name='\xa9' value='1'/></p>"
         sel = self.sscls(text=body)
@@ -710,7 +734,6 @@ def test_re_replace_entities(self) -> None:
         self.assertEqual(
             x.xpath("//script")[0].re(name_re, replace_entities=False), [expected]
         )
-
         self.assertEqual(
             x.xpath("//script/text()").re_first(name_re, replace_entities=False),
             expected,
@@ -724,6 +747,28 @@ def test_re_intl(self) -> None:
         x = self.sscls(text=body)
         self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"])
 
+    def test_re_flags(self) -> None:
+        body = """
+        <script>
+            function example() {
+            "name": "Adrian",
+            "points": 3,
+            }
+        </script>
+        """
+        sel = self.sscls(text=body)
+
+        self.assertEqual(sel.xpath("//script/text()").re(r"example\(\) ({.*})"), [])
+        self.assertEqual(
+            sel.xpath("//script/text()").re(r"example\(\) ({.*})", flags=re.DOTALL),
+            [
+                """{
+            "name": "Adrian",
+            "points": 3,
+            }"""
+            ],
+        )
+
     def test_selector_over_text(self) -> None:
         hs = self.sscls(text="<root>lala</root>")
         self.assertEqual(hs.extract(), "<html><body><root>lala</root></body></html>")