Skip to content

Commit

Permalink
Add the Scrapy addon.
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Jun 30, 2023
1 parent 3c82d07 commit b0095f5
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 5 deletions.
18 changes: 17 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ Installation
Configuration
=============

To enable this plugin:
To enable this plugin, if your Scrapy version supports addons:

- Add ``"scrapy_zyte_api.ScrapyZyteAPIAddon"`` to the ``ADDONS`` setting with
any priority.

Otherwise:

- Set the ``http`` and ``https`` keys in the `DOWNLOAD_HANDLERS
<https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-DOWNLOAD_HANDLERS>`_
Expand All @@ -62,13 +67,24 @@ To enable this plugin:
Scrapy setting to
``"twisted.internet.asyncioreactor.AsyncioSelectorReactor"``.

In both cases:

- Set `your Zyte API key
<https://docs.zyte.com/zyte-api/usage/general.html#authorization>`_ as
either the ``ZYTE_API_KEY`` Scrapy setting or as an environment variable of
the same name.

For example, in the ``settings.py`` file of your Scrapy project:

.. code-block:: python
ADDONS = {
"scrapy_zyte_api.ScrapyZyteAPIAddon": 1,
}
ZYTE_API_KEY = "YOUR_API_KEY"
or:

.. code-block:: python
DOWNLOAD_HANDLERS = {
Expand Down
1 change: 1 addition & 0 deletions scrapy_zyte_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@

from ._downloader_middleware import ScrapyZyteAPIDownloaderMiddleware # NOQA
from ._request_fingerprinter import ScrapyZyteAPIRequestFingerprinter # NOQA
from .addon import ScrapyZyteAPIAddon # NOQA
from .handler import ScrapyZyteAPIDownloadHandler # NOQA
24 changes: 24 additions & 0 deletions scrapy_zyte_api/addon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from scrapy.settings import BaseSettings

from scrapy_zyte_api import ScrapyZyteAPIDownloaderMiddleware


class ScrapyZyteAPIAddon:
    """Scrapy add-on that applies the scrapy-zyte-api settings in one step.

    Listing this class in the ``ADDONS`` setting replaces the manual
    configuration of the download handlers, the downloader middleware, the
    request fingerprinter and the Twisted reactor.
    """

    def update_settings(self, settings: BaseSettings) -> None:
        """Write the scrapy-zyte-api configuration into *settings*.

        The ``http`` and ``https`` download handlers and the downloader
        middleware entry are assigned directly on the existing
        ``DOWNLOAD_HANDLERS`` / ``DOWNLOADER_MIDDLEWARES`` values, while the
        request fingerprinter class and the Twisted reactor are stored with
        ``settings.set(..., "addon")``.
        """
        handler_path = "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler"
        # Both URL schemes are routed through the same download handler.
        for scheme in ("http", "https"):
            settings["DOWNLOAD_HANDLERS"][scheme] = handler_path

        settings["DOWNLOADER_MIDDLEWARES"][ScrapyZyteAPIDownloaderMiddleware] = 1000

        # Scalar settings, all written with the "addon" priority label.
        addon_overrides = (
            (
                "REQUEST_FINGERPRINTER_CLASS",
                "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter",
            ),
            (
                "TWISTED_REACTOR",
                "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            ),
        )
        for setting_name, setting_value in addon_overrides:
            settings.set(setting_name, setting_value, "addon")
17 changes: 13 additions & 4 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from contextlib import asynccontextmanager, contextmanager
from os import environ
from typing import Optional
from typing import Any, Dict, Optional

from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.utils.test import get_crawler as _get_crawler
from zyte_api.aio.client import AsyncClient

from scrapy_zyte_api.addon import ScrapyZyteAPIAddon
from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler

_API_KEY = "a"

DEFAULT_CLIENT_CONCURRENCY = AsyncClient(api_key=_API_KEY).n_conn
SETTINGS = {
SETTINGS: Dict[str, Any] = {
"DOWNLOAD_HANDLERS": {
"http": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler",
"https": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler",
Expand All @@ -21,6 +22,12 @@
"ZYTE_API_KEY": _API_KEY,
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
}
# Settings for exercising the add-on code path: only the add-on itself and the
# API key are listed here; no download handlers or reactor are spelled out.
SETTINGS_ADDON: Dict[str, Any] = {
    "ADDONS": {
        ScrapyZyteAPIAddon: 1,
    },
    "ZYTE_API_KEY": _API_KEY,
}
UNSET = object()


Expand Down Expand Up @@ -49,8 +56,10 @@ def get_download_handler(crawler, schema):


@asynccontextmanager
async def make_handler(settings: dict, api_url: Optional[str] = None):
settings = {**SETTINGS, **settings}
async def make_handler(
settings: Dict[str, Any], api_url: Optional[str] = None, *, use_addon: bool = False
):
settings = {**(SETTINGS if not use_addon else SETTINGS_ADDON), **settings}
if api_url is not None:
settings["ZYTE_API_URL"] = api_url
crawler = get_crawler(settings)
Expand Down
9 changes: 9 additions & 0 deletions tests/test_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,3 +430,12 @@ def test_log_request_truncate_negative(enabled):
settings=None,
crawler=crawler,
)


@ensureDeferred
async def test_addon(mockserver):
    """A handler built with ``use_addon=True`` records one successful request."""
    api_url = mockserver.urljoin("/")
    async with make_handler({}, api_url, use_addon=True) as handler:
        request = Request("https://example.com", meta={"zyte_api": {"foo": "bar"}})
        await handler.download_request(request, None)
        success_count = handler._stats.get_value("scrapy-zyte-api/success")
        assert success_count == 1

0 comments on commit b0095f5

Please sign in to comment.