From 3b4709739f617b568abf7e8fe4730ebf8e1eee4c Mon Sep 17 00:00:00 2001 From: Victor Ruiz Date: Wed, 21 Aug 2024 08:43:36 +0200 Subject: [PATCH 1/6] Add LocationSessionConfig with docstrings --- scrapy_zyte_api/_session.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index cac137c5..8eea7fa5 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -966,3 +966,31 @@ async def process_exception( spider=spider, reason=reason, ) + + +class LocationSessionConfig(SessionConfig): + """Location-based session configuration for :ref:`scrapy-zyte-api sessions + `. The aim of this class is to reduce boilerplate code. + If not location is provided, it defaults to :func:~scrapy_zyte_api._session.SessionConfig.params` + and :func:~scrapy_zyte_api._session.SessionConfig.check` methods. Otherwise, it uses the location + related logic define by :func:~scrapy_zyte_api._session.LocationSessionConfig.location_params` + and :func:~scrapy_zyte_api._session.LocationSessionConfig.location_checks`. + """ + + def params(self, request): + if not (location := self.location(request)): + return super().params(request) + return self.location_params(request, location) + + def check(self, response, request): + if not (location := self.location(request)): + return super().check(response, request) + return self.location_check(response, request, location) + + def location_params(self, request, location): + """Method to add params location-specific. Similar to :func:`~scrapy_zyte_api._session.SessionConfig.check`""" + return super().params(request) + + def location_check(self, response, request, location): + """Method to check location-specific conditions. Similar to :func:`~scrapy_zyte_api._session.SessionConfig.check`""" + return super().check(response, request) From 5818fb0d306659b41c426ea8c1df6c430f993404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 26 Aug 2024 17:36:12 +0200 Subject: [PATCH 2/6] Improve docs --- docs/usage/session.rst | 9 +++++++++ scrapy_zyte_api/__init__.py | 1 + scrapy_zyte_api/_session.py | 25 +++++++++++++++++-------- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/docs/usage/session.rst b/docs/usage/session.rst index 09a7d792..8e3aad61 100644 --- a/docs/usage/session.rst +++ b/docs/usage/session.rst @@ -338,6 +338,15 @@ To define a different session config for a given URL pattern, install .. autofunction:: scrapy_zyte_api.session_config +If you only need to override the :meth:`SessionConfig.check +` or :meth:`SessionConfig.params +` methods for scenarios involving a +location, you may subclass :class:`~scrapy_zyte_api.LocationSessionConfig` +instead: + +.. autoclass:: scrapy_zyte_api.LocationSessionConfig + :members: location_check, location_params + If in a session config implementation or in any other Scrapy component you need to tell whether a request is a :ref:`session initialization request ` or not, use :func:`~scrapy_zyte_api.is_session_init_request`: diff --git a/scrapy_zyte_api/__init__.py b/scrapy_zyte_api/__init__.py index de1ff8e1..88bec2fb 100644 --- a/scrapy_zyte_api/__init__.py +++ b/scrapy_zyte_api/__init__.py @@ -17,6 +17,7 @@ ) from ._session import SESSION_DEFAULT_RETRY_POLICY as _SESSION_DEFAULT_RETRY_POLICY from ._session import ( + LocationSessionConfig, ScrapyZyteAPISessionDownloaderMiddleware, SessionConfig, is_session_init_request, diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index 8eea7fa5..a6e927e7 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -348,6 +348,8 @@ def params(self, request: Request) -> Dict[str, Any]: The returned parameters do not need to include :http:`request:url`. If missing, it is picked from the request :ref:`triggering a session initialization request `. + + .. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig` """ if location := self.location(request): return { @@ -372,6 +374,8 @@ def check(self, response: Response, request: Request) -> bool: If you need to tell whether *request* is a :ref:`session initialization request ` or not, use :func:`~scrapy_zyte_api.is_session_init_request`. + + .. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig` """ if self._checker: return self._checker.check(response, request) @@ -969,12 +973,13 @@ async def process_exception( class LocationSessionConfig(SessionConfig): - """Location-based session configuration for :ref:`scrapy-zyte-api sessions - `. The aim of this class is to reduce boilerplate code. - If not location is provided, it defaults to :func:~scrapy_zyte_api._session.SessionConfig.params` - and :func:~scrapy_zyte_api._session.SessionConfig.check` methods. Otherwise, it uses the location - related logic define by :func:~scrapy_zyte_api._session.LocationSessionConfig.location_params` - and :func:~scrapy_zyte_api._session.LocationSessionConfig.location_checks`. + """:class:`~scrapy_zyte_api.SessionConfig` subclass to minimize boilerplate + when implementing location-specific session configs, i.e. session configs + where the default values should be used unless a location is set. + + Provides counterparts to some :class:`~scrapy_zyte_api.SessionConfig` + methods that are only called when a location is set, and get that location + as a parameter. """ def params(self, request): @@ -988,9 +993,13 @@ def check(self, response, request): return self.location_check(response, request, location) def location_params(self, request, location): - """Method to add params location-specific. Similar to :func:`~scrapy_zyte_api._session.SessionConfig.check`""" + """Like :class:`SessionConfig.params + `, but it is only called when a + location it set, and gets that *location* as a parameter.""" return super().params(request) def location_check(self, response, request, location): - """Method to check location-specific conditions. Similar to :func:`~scrapy_zyte_api._session.SessionConfig.check`""" + """Like :class:`SessionConfig.check + `, but it is only called when a + location it set, and gets that *location* as a parameter.""" return super().check(response, request) From 09302455c7dd360435751dc0b2ded3e48dc4ce09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 26 Aug 2024 18:13:33 +0200 Subject: [PATCH 3/6] Add type hints and a test --- scrapy_zyte_api/_session.py | 12 ++-- tests/test_sessions.py | 114 ++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 4 deletions(-) diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index a6e927e7..87313848 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -982,23 +982,27 @@ class LocationSessionConfig(SessionConfig): as a parameter. """ - def params(self, request): + def params(self, request: Request) -> Dict[str, Any]: if not (location := self.location(request)): return super().params(request) return self.location_params(request, location) - def check(self, response, request): + def check(self, response: Response, request: Request) -> bool: if not (location := self.location(request)): return super().check(response, request) return self.location_check(response, request, location) - def location_params(self, request, location): + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: """Like :class:`SessionConfig.params `, but it is only called when a location it set, and gets that *location* as a parameter.""" return super().params(request) - def location_check(self, response, request, location): + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: """Like :class:`SessionConfig.check `, but it is only called when a location it set, and gets that *location* as a parameter.""" diff --git a/tests/test_sessions.py b/tests/test_sessions.py index 282034e6..0fe7b22c 100644 --- a/tests/test_sessions.py +++ b/tests/test_sessions.py @@ -17,6 +17,7 @@ from scrapy_zyte_api import ( SESSION_AGGRESSIVE_RETRY_POLICY, SESSION_DEFAULT_RETRY_POLICY, + LocationSessionConfig, SessionConfig, is_session_init_request, session_config, @@ -2080,6 +2081,119 @@ class CustomSessionConfig(SessionConfig): pass +@ensureDeferred +async def test_location_session_config(mockserver): + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001.example", + "postal-code-10001-fail.example", + "postal-code-10001-alternative.example", + ] + ) + class CustomSessionConfig(LocationSessionConfig): + + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + assert location == {"postalCode": "10002"} + return { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + } + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + assert location == {"postalCode": "10002"} + domain = urlparse_cached(request).netloc + return "fail" not in domain + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-alternative.example": + return "postal-code-10001.example" + return domain + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + # We set a location to force the location-specific methods of the + # session config class to be called, but we set the wrong location so + # that the test would not pass were it not for our custom + # implementation which ignores the input location and instead sets the + # right one. + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001.example", + "https://postal-code-10001-alternative.example", + "https://postal-code-10001-fail.example", + ] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1, + } + + @ensureDeferred async def test_session_refresh(mockserver): """If a response does not pass a session validity check, the session is From 5905d465e1420e8a40f33425c023910a7e920887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 13:03:04 +0200 Subject: [PATCH 4/6] Improve test coverage --- tests/test_sessions.py | 141 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/tests/test_sessions.py b/tests/test_sessions.py index 0fe7b22c..cc249d3c 100644 --- a/tests/test_sessions.py +++ b/tests/test_sessions.py @@ -2194,6 +2194,147 @@ def parse(self, response): } +@ensureDeferred +async def test_location_session_config_no_methods(mockserver): + """If no location_* methods are defined, LocationSessionConfig works the + same as SessionConfig.""" + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001.example", + "postal-code-10001-alternative.example", + ] + ) + class CustomSessionConfig(LocationSessionConfig): + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-alternative.example": + return "postal-code-10001.example" + return domain + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001.example", + "https://postal-code-10001-alternative.example", + ] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_location_session_config_no_location(mockserver): + """If no location is configured, the methods are never called.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example", "a.example"]) + class CustomSessionConfig(LocationSessionConfig): + + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + assert False + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + assert False + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example", "https://a.example"] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + @ensureDeferred async def test_session_refresh(mockserver): """If a response does not pass a session validity check, the session is From 7cdae9e801f69e09e82724965e186f181ad54c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 14:26:30 +0200 Subject: [PATCH 5/6] Update scrapy_zyte_api/_session.py Co-authored-by: Andrey Rakhmatullin --- scrapy_zyte_api/_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index 87313848..10fc7d4d 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -997,7 +997,7 @@ def location_params( ) -> Dict[str, Any]: """Like :class:`SessionConfig.params `, but it is only called when a - location it set, and gets that *location* as a parameter.""" + location is set, and gets that *location* as a parameter.""" return super().params(request) def location_check( From 5e771e0812633f4b6ffa30ffba6ea11fbd8338b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 9 Sep 2024 14:26:39 +0200 Subject: [PATCH 6/6] Update scrapy_zyte_api/_session.py Co-authored-by: Andrey Rakhmatullin --- scrapy_zyte_api/_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index 10fc7d4d..eb674492 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -1005,5 +1005,5 @@ def location_check( ) -> bool: """Like :class:`SessionConfig.check `, but it is only called when a - location it set, and gets that *location* as a parameter.""" + location is set, and gets that *location* as a parameter.""" return super().check(response, request)