diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
index 34f5159b..4042775c 100644
--- a/scrapy_zyte_api/providers.py
+++ b/scrapy_zyte_api/providers.py
@@ -4,7 +4,7 @@
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.utils.defer import maybe_deferred_to_future
-from scrapy_poet import InjectionMiddleware, PageObjectInputProvider
+from scrapy_poet import PageObjectInputProvider
 from web_poet import (
     AnyResponse,
     BrowserHtml,
@@ -86,7 +86,6 @@ class ZyteApiProvider(PageObjectInputProvider):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self._injection_mw = None
         self._should_track_auto_fields = None
         self._tracked_auto_fields = set()
 
@@ -102,23 +101,7 @@ def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
         )
         if self._should_track_auto_fields is False:
             return
-        if self._injection_mw is None:
-            try:
-                self._injection_mw = crawler.get_downloader_middleware(
-                    InjectionMiddleware
-                )
-            except AttributeError:
-                for component in crawler.engine.downloader.middleware.middlewares:
-                    if isinstance(component, InjectionMiddleware):
-                        self._injection_mw = component
-                        break
-            if self._injection_mw is None:
-                raise RuntimeError(
-                    "Could not find the InjectionMiddleware among enabled "
-                    "downloader middlewares. Please, ensure you have properly "
-                    "configured scrapy-poet."
-                )
-        cls = self._injection_mw.registry.page_cls_for_item(request.url, cls) or cls
+        cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls
         if cls in self._tracked_auto_fields:
             return
         self._tracked_auto_fields.add(cls)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 774cb292..1f8dce8a 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -1,4 +1,5 @@
 import sys
+from collections import defaultdict
 
 import pytest
 
@@ -1054,17 +1055,45 @@ async def test_auto_field_stats_no_override(mockserver):
     """When requesting an item directly from Zyte API, without an override to
     change fields, stats reflect the entire list of item fields."""
 
+    from scrapy.statscollectors import MemoryStatsCollector
+
+    duplicate_stat_calls = defaultdict(int)
+
+    class OnlyOnceStatsCollector(MemoryStatsCollector):
+
+        def track_duplicate_stat_calls(self, key):
+            if key.startswith("scrapy-zyte-api/auto_fields/") and key in self._stats:
+                duplicate_stat_calls[key] += 1
+
+        def set_value(self, key, value, spider=None):
+            self.track_duplicate_stat_calls(key)
+            super().set_value(key, value, spider)
+
+        def inc_value(self, key, count=1, start=1, spider=None):
+            self.track_duplicate_stat_calls(key)
+            super().inc_value(key, count, start, spider)
+
+        def max_value(self, key, value, spider=None):
+            self.track_duplicate_stat_calls(key)
+            super().max_value(key, value, spider)
+
+        def min_value(self, key, value, spider=None):
+            self.track_duplicate_stat_calls(key)
+            super().min_value(key, value, spider)
+
     class TestSpider(Spider):
         name = "test_spider"
         url: str
 
         def start_requests(self):
-            yield Request(self.url, callback=self.parse)
+            for url in ("data:,a", "data:,b"):
+                yield Request(url, callback=self.parse)
 
         def parse(self, response: DummyResponse, product: Product):
             pass
 
     settings = create_scrapy_settings()
+    settings["STATS_CLASS"] = OnlyOnceStatsCollector
     settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}
     settings["ZYTE_API_AUTO_FIELD_STATS"] = True
     settings["ZYTE_API_URL"] = mockserver.urljoin("/")
@@ -1080,6 +1109,7 @@ def parse(self, response: DummyResponse, product: Product):
             "(all fields)"
         ),
     }
+    assert all(value == 0 for value in duplicate_stat_calls.values())
 
 
 @ensureDeferred