Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LocationSessionConfig with docstrings #215

Merged
merged 6 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/usage/session.rst
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,15 @@ To define a different session config for a given URL pattern, install

.. autofunction:: scrapy_zyte_api.session_config

If you only need to override the :meth:`SessionConfig.check
<scrapy_zyte_api.SessionConfig.check>` or :meth:`SessionConfig.params
<scrapy_zyte_api.SessionConfig.params>` methods for scenarios involving a
location, you may subclass :class:`~scrapy_zyte_api.LocationSessionConfig`
instead:

.. autoclass:: scrapy_zyte_api.LocationSessionConfig
    :members: location_check, location_params

If in a session config implementation or in any other Scrapy component you need
to tell whether a request is a :ref:`session initialization request
<session-init>` or not, use :func:`~scrapy_zyte_api.is_session_init_request`:
Expand Down
1 change: 1 addition & 0 deletions scrapy_zyte_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from ._session import SESSION_DEFAULT_RETRY_POLICY as _SESSION_DEFAULT_RETRY_POLICY
from ._session import (
LocationSessionConfig,
ScrapyZyteAPISessionDownloaderMiddleware,
SessionConfig,
is_session_init_request,
Expand Down
41 changes: 41 additions & 0 deletions scrapy_zyte_api/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,8 @@
The returned parameters do not need to include :http:`request:url`. If
missing, it is picked from the request :ref:`triggering a session
initialization request <pool-size>`.

.. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig`
"""
if location := self.location(request):
return {
Expand All @@ -372,6 +374,8 @@
If you need to tell whether *request* is a :ref:`session initialization
request <session-init>` or not, use
:func:`~scrapy_zyte_api.is_session_init_request`.

.. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig`
"""
if self._checker:
return self._checker.check(response, request)
Expand Down Expand Up @@ -966,3 +970,40 @@
spider=spider,
reason=reason,
)


class LocationSessionConfig(SessionConfig):
    """:class:`~scrapy_zyte_api.SessionConfig` subclass that cuts down the
    boilerplate of location-specific session configs, i.e. session configs
    that should keep the default behavior unless a location is set.

    It offers counterparts to some :class:`~scrapy_zyte_api.SessionConfig`
    methods. Those counterparts are only called when a location is set, and
    they receive that location as a parameter.
    """

    def params(self, request: Request) -> Dict[str, Any]:
        # Hand over to the location-aware hook only when a location is set;
        # otherwise keep the parent behavior untouched.
        location = self.location(request)
        if location:
            return self.location_params(request, location)
        return super().params(request)

    def check(self, response: Response, request: Request) -> bool:
        # Same dispatch as in params(): the location-aware hook is used only
        # when a location is set.
        location = self.location(request)
        if location:
            return self.location_check(response, request, location)
        return super().check(response, request)

    def location_params(
        self, request: Request, location: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Like :meth:`SessionConfig.params
        <scrapy_zyte_api.SessionConfig.params>`, but it is only called when a
        location is set, and gets that *location* as a parameter."""
        return super().params(request)

    def location_check(
        self, response: Response, request: Request, location: Dict[str, Any]
    ) -> bool:
        """Like :meth:`SessionConfig.check
        <scrapy_zyte_api.SessionConfig.check>`, but it is only called when a
        location is set, and gets that *location* as a parameter."""
        return super().check(response, request)

Check warning on line 1009 in scrapy_zyte_api/_session.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/_session.py#L1009

Added line #L1009 was not covered by tests
255 changes: 255 additions & 0 deletions tests/test_sessions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from scrapy_zyte_api import (
SESSION_AGGRESSIVE_RETRY_POLICY,
SESSION_DEFAULT_RETRY_POLICY,
LocationSessionConfig,
SessionConfig,
is_session_init_request,
session_config,
Expand Down Expand Up @@ -2080,6 +2081,260 @@ class CustomSessionConfig(SessionConfig):
pass


@ensureDeferred
async def test_location_session_config(mockserver):
    """The ``location_*`` methods of a LocationSessionConfig subclass are
    used when a location is configured, and stop being used once the session
    config registry is reset."""
    pytest.importorskip("web_poet")

    @session_config(
        [
            "postal-code-10001.example",
            "postal-code-10001-fail.example",
            "postal-code-10001-alternative.example",
        ]
    )
    class CustomSessionConfig(LocationSessionConfig):

        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            assert location == {"postalCode": "10002"}
            return {
                "actions": [
                    {
                        "action": "setLocation",
                        "address": {"postalCode": "10001"},
                    }
                ]
            }

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            assert location == {"postalCode": "10002"}
            domain = urlparse_cached(request).netloc
            return "fail" not in domain

        def pool(self, request: Request) -> str:
            domain = urlparse_cached(request).netloc
            if domain == "postal-code-10001-alternative.example":
                return "postal-code-10001.example"
            return domain

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        # We set a location to force the location-specific methods of the
        # session config class to be called, but we set the wrong location so
        # that the test would not pass were it not for our custom
        # implementation which ignores the input location and instead sets the
        # right one.
        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"},
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = [
            "https://postal-code-10001.example",
            "https://postal-code-10001-alternative.example",
            "https://postal-code-10001-fail.example",
        ]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. This runs in a ``finally`` so that a failed assertion
        # above cannot leave the custom config registered.
        session_config_registry.__init__()  # type: ignore[misc]

    # Check the clean up: with the registry reset, the expected stats below
    # show one pool per domain and a failed session initialization for each.
    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
    await crawler.crawl()

    session_stats = {
        k: v
        for k, v in crawler.stats.get_stats().items()
        if k.startswith("scrapy-zyte-api/sessions")
    }
    assert session_stats == {
        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
        "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1,
        "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1,
    }


@ensureDeferred
async def test_location_session_config_no_methods(mockserver):
    """If no location_* methods are defined, LocationSessionConfig works the
    same as SessionConfig."""
    pytest.importorskip("web_poet")

    @session_config(
        [
            "postal-code-10001.example",
            "postal-code-10001-alternative.example",
        ]
    )
    class CustomSessionConfig(LocationSessionConfig):

        def pool(self, request: Request) -> str:
            domain = urlparse_cached(request).netloc
            if domain == "postal-code-10001-alternative.example":
                return "postal-code-10001.example"
            return domain

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"},
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = [
            "https://postal-code-10001.example",
            "https://postal-code-10001-alternative.example",
        ]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        # Both domains share the "postal-code-10001.example" pool because of
        # the custom pool() implementation above.
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. This runs in a ``finally`` so that a failed assertion
        # above cannot leave the custom config registered.
        session_config_registry.__init__()  # type: ignore[misc]


@ensureDeferred
async def test_location_session_config_no_location(mockserver):
    """If no location is configured, the methods are never called."""
    pytest.importorskip("web_poet")

    @session_config(["postal-code-10001.example", "a.example"])
    class CustomSessionConfig(LocationSessionConfig):

        # Both hooks fail unconditionally: reaching either of them means that
        # a location-specific method was called without a location.
        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            assert False

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            assert False

    # Note that, unlike in the tests above, no ZYTE_API_SESSION_LOCATION is
    # set here.
    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = ["https://postal-code-10001.example", "https://a.example"]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
            "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1,
            "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. This runs in a ``finally`` so that a failed assertion
        # above cannot leave the custom config registered.
        session_config_registry.__init__()  # type: ignore[misc]


@ensureDeferred
async def test_session_refresh(mockserver):
"""If a response does not pass a session validity check, the session is
Expand Down