diff --git a/CHANGES.rst b/CHANGES.rst index e61df16d..b890acfe 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,13 @@ Changes ======= +N.N.N (YYYY-MM-DD) +------------------ + +* ``scrapy-zyte-api[provider]`` now requires zyte-common-items >= 0.20.0. + +* Added the :setting:`ZYTE_API_AUTO_FIELD_STATS` setting. + 0.21.0 (2024-07-02) ------------------- diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst index b62ca257..77922721 100644 --- a/docs/reference/settings.rst +++ b/docs/reference/settings.rst @@ -6,6 +6,40 @@ Settings :ref:`Settings ` for scrapy-zyte-api. +.. setting:: ZYTE_API_AUTO_FIELD_STATS + +ZYTE_API_AUTO_FIELD_STATS +========================= + +Default: ``False`` + +Enables stats that indicate which requested fields :ref:`obtained through +scrapy-poet integration ` come directly from +:ref:`zyte-api-extract`. + +If for any request no page object class is used to override +:ref:`zyte-api-extract` fields for a given item type, the following stat is +set: + +.. code-block:: python + + "scrapy-zyte-api/auto_fields/": "(all fields)" + +.. note:: A literal ``(all fields)`` string is used as value, not a list with + all fields. + +If for any request a custom page object class is used to override some +:ref:`zyte-api-extract` fields, the following stat is set: + +.. code-block:: python + + "scrapy-zyte-api/auto_fields/": ( + "" + ) + +.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine + whether a field has been overridden or not. + .. setting:: ZYTE_API_AUTOMAP_PARAMS ZYTE_API_AUTOMAP_PARAMS diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 3a7979fc..4042775c 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Sequence, Set +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast from andi.typeutils import is_typing_annotated, strip_annotated from scrapy import Request @@ -13,16 +13,26 @@ HttpResponseHeaders, ) from web_poet.annotated import AnnotatedInstance +from web_poet.fields import get_fields_dict +from web_poet.utils import get_fq_class_name from zyte_common_items import ( Article, ArticleList, ArticleNavigation, + AutoArticleListPage, + AutoArticleNavigationPage, + AutoArticlePage, + AutoJobPostingPage, + AutoProductListPage, + AutoProductNavigationPage, + AutoProductPage, Item, JobPosting, Product, ProductList, ProductNavigation, ) +from zyte_common_items.fields import is_auto_field from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot from scrapy_zyte_api._annotations import _ActionResult @@ -35,6 +45,26 @@ NO_CALLBACK = None +_ITEM_KEYWORDS: Dict[type, str] = { + Product: "product", + ProductList: "productList", + ProductNavigation: "productNavigation", + Article: "article", + ArticleList: "articleList", + ArticleNavigation: "articleNavigation", + JobPosting: "jobPosting", +} +_AUTO_PAGES: Set[type] = { + AutoArticlePage, + AutoArticleListPage, + AutoArticleNavigationPage, + AutoJobPostingPage, + AutoProductPage, + AutoProductListPage, + AutoProductNavigationPage, +} + + class ZyteApiProvider(PageObjectInputProvider): name = "zyte_api" @@ -54,9 +84,38 @@ class ZyteApiProvider(PageObjectInputProvider): Screenshot, } + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._should_track_auto_fields = None + self._tracked_auto_fields = set() + def is_provided(self, type_: Callable) -> bool: return super().is_provided(strip_annotated(type_)) + def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): + if cls not in _ITEM_KEYWORDS: + return + if self._should_track_auto_fields is None: + self._should_track_auto_fields = crawler.settings.getbool( + "ZYTE_API_AUTO_FIELD_STATS", False + ) + if self._should_track_auto_fields is False: + return + cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls + if cls in self._tracked_auto_fields: + return + self._tracked_auto_fields.add(cls) + if cls in _ITEM_KEYWORDS: + field_list = "(all fields)" + else: + auto_fields = set() + for field_name in get_fields_dict(cls): + if is_auto_field(cls, field_name): # type: ignore[arg-type] + auto_fields.add(field_name) + field_list = " ".join(sorted(auto_fields)) + cls_fqn = get_fq_class_name(cls) + crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) + async def __call__( # noqa: C901 self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: @@ -66,6 +125,7 @@ async def __call__( # noqa: C901 http_response = None screenshot_requested = Screenshot in to_provide for cls in list(to_provide): + self._track_auto_fields(crawler, request, cast(type, cls)) item = self.injector.weak_cache.get(request, {}).get(cls) if item: results.append(item) @@ -89,15 +149,6 @@ async def __call__( # noqa: C901 return results html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide - item_keywords: Dict[type, str] = { - Product: "product", - ProductList: "productList", - ProductNavigation: "productNavigation", - Article: "article", - ArticleList: "articleList", - ArticleNavigation: "articleNavigation", - JobPosting: "jobPosting", - } zyte_api_meta = { **crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"), @@ -135,7 +186,7 @@ async def __call__( # noqa: C901 } ) continue - kw = item_keywords.get(cls_stripped) + kw = _ITEM_KEYWORDS.get(cls_stripped) if not kw: continue item_requested = True @@ -165,7 +216,7 @@ async def __call__( # noqa: C901 ) extract_from = None # type: ignore[assignment] - for item_type, kw in item_keywords.items(): + for item_type, kw in _ITEM_KEYWORDS.items(): options_name = f"{kw}Options" if item_type not in to_provide_stripped and options_name in zyte_api_meta: del zyte_api_meta[options_name] @@ -271,7 +322,7 @@ async def __call__( # noqa: C901 result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined] results.append(result) continue - kw = item_keywords.get(cls_stripped) + kw = _ITEM_KEYWORDS.get(cls_stripped) if not kw: continue assert issubclass(cls_stripped, Item) diff --git a/setup.py b/setup.py index 05c9c532..ac2de981 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version(): "andi>=0.6.0", "scrapy-poet>=0.22.3", "web-poet>=0.17.0", - "zyte-common-items>=0.8.0", + "zyte-common-items>=0.20.0", ] }, classifiers=[ diff --git a/tests/test_providers.py b/tests/test_providers.py index 7b4abb05..74dd17cc 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,4 +1,5 @@ import sys +from collections import defaultdict import pytest @@ -18,14 +19,17 @@ BrowserResponse, HttpResponse, ItemPage, + default_registry, field, handle_urls, ) -from zyte_common_items import BasePage, Product +from web_poet.pages import get_item_cls +from zyte_common_items import AutoProductPage, BasePage, BaseProductPage, Product +from zyte_common_items.fields import auto_field from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot, actions from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler -from scrapy_zyte_api.providers import ZyteApiProvider +from scrapy_zyte_api.providers import _AUTO_PAGES, _ITEM_KEYWORDS, ZyteApiProvider from . import SETTINGS from .mockserver import get_ephemeral_port @@ -1015,3 +1019,592 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign }, ] ) + + +def test_auto_pages_set(): + assert set(_ITEM_KEYWORDS) == {get_item_cls(cls) for cls in _AUTO_PAGES} # type: ignore[call-overload] + + +@ensureDeferred +async def test_auto_field_stats_not_enabled(mockserver): + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == {} + + +@ensureDeferred +async def test_auto_field_stats_no_override(mockserver): + """When requesting an item directly from Zyte API, without an override to + change fields, stats reflect the entire list of item fields.""" + + from scrapy.statscollectors import MemoryStatsCollector + + duplicate_stat_calls: defaultdict[str, int] = defaultdict(int) + + class OnlyOnceStatsCollector(MemoryStatsCollector): + + def track_duplicate_stat_calls(self, key): + if key.startswith("scrapy-zyte-api/auto_fields/") and key in self._stats: + duplicate_stat_calls[key] += 1 + + def set_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().set_value(key, value, spider) + + def inc_value(self, key, count=1, start=1, spider=None): + self.track_duplicate_stat_calls(key) + super().inc_value(key, count, start, spider) + + def max_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().max_value(key, value, spider) + + def min_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().min_value(key, value, spider) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + for url in ("data:,a", "data:,b"): + yield Request(url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["STATS_CLASS"] = OnlyOnceStatsCollector + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( + "(all fields)" + ), + } + assert all(value == 0 for value in duplicate_stat_calls.values()) + + +@ensureDeferred +async def test_auto_field_stats_partial_override(mockserver): + """When requesting an item and having an Auto…Page subclass to change + fields, stats reflect the list of item fields not defined in the + subclass. Defined field methods are not listed, even if they return the + original item field, directly or as a fallback.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_partial_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_full_override(mockserver): + """When requesting an item and having an Auto…Page subclass to change + all fields, stats reflect the list of non-overriden item fields as an empty + string.""" + + # Copy-paste of fields from the AutoProductPage implementation, with type + # hints removed. + class MyProductPage(AutoProductPage): + + @field + def additionalProperties(self): + return self.product.additionalProperties + + @field + def aggregateRating(self): + return self.product.aggregateRating + + @field + def availability(self): + return self.product.availability + + @field + def brand(self): + return self.product.brand + + @field + def breadcrumbs(self): + return self.product.breadcrumbs + + @field + def canonicalUrl(self): + return self.product.canonicalUrl + + @field + def color(self): + return self.product.color + + @field + def currency(self): + return self.product.currency + + @field + def currencyRaw(self): + return self.product.currencyRaw + + @field + def description(self): + return self.product.description + + @field + def descriptionHtml(self): + return self.product.descriptionHtml + + @field + def features(self): + return self.product.features + + @field + def gtin(self): + return self.product.gtin + + @field + def images(self): + return self.product.images + + @field + def mainImage(self): + return self.product.mainImage + + @field + def metadata(self): + return self.product.metadata + + @field + def mpn(self): + return self.product.mpn + + @field + def name(self): + return self.product.name + + @field + def price(self): + return self.product.price + + @field + def productId(self): + return self.product.productId + + @field + def regularPrice(self): + return self.product.regularPrice + + @field + def size(self): + return self.product.size + + @field + def sku(self): + return self.product.sku + + @field + def style(self): + return self.product.style + + @field + def url(self): + return self.product.url + + @field + def variants(self): + return self.product.variants + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_full_override..MyProductPage": "", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_callback_override(mockserver): + """Fields overridden in callbacks, instead of using a page object, are not + taken into account.""" + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + product.name = "foo" + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( + "(all fields)" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_item_page_override(mockserver): + """The stat accounts for the configured page for a given item, so if you + request that page directly, things work the same as if you request the item + itself.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, page: MyProductPage): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_item_page_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_alt_page_override(mockserver): + """The stat does not account for alternatives pages, so if you request a + page that provides an item, the page that counts for stats is the + configured page for that item, not the actual page requested.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class AltProductPage(AutoProductPage): + + @field + def sku(self): + return "foo" + + @field + def currencyRaw(self): + return self.product.currencyRaw + + handle_urls(f"{mockserver.host}:{mockserver.port}", priority=0)(AltProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, page: AltProductPage): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_alt_page_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_non_auto_override(mockserver): + """If instead of using an Auto…Page class you use a custom class, all + fields are assumed to be overridden.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @field + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_non_auto_override..MyProductPage": "", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_auto_field_decorator(mockserver): + """Using @auto_field forces a field to not be considered overridden.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @auto_field + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_decorator..MyProductPage": "additionalProperties", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_stats_auto_field_meta(mockserver): + """Using @field(meta={"auto_field": True}) has the same effect as using + @auto_field.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @field(meta={"auto_field": True}) + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + pass + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_meta..MyProductPage": "additionalProperties", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] diff --git a/tox.ini b/tox.ini index 073909e5..086d7971 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,7 @@ deps = andi==0.6.0 scrapy-poet==0.22.3 web-poet==0.17.0 - zyte-common-items==0.8.0 + zyte-common-items==0.20.0 [testenv:pinned-extra] basepython=python3.8