Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Article AI Spider #47

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ zyte-spider-templates documentation

templates/index
E-commerce <templates/e-commerce>
Article <templates/article>

.. toctree::
:caption: Customization
Expand Down
24 changes: 24 additions & 0 deletions docs/templates/article.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
.. _article:

==========================================
Article spider template (``article``)
==========================================

Basic use
=========

.. code-block:: shell

scrapy crawl article -a url="https://www.example.com/"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleSpiderParams
:inherited-members: BaseModel

.. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy

.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom

.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
3 changes: 3 additions & 0 deletions docs/templates/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ Spider template list

:ref:`E-commerce <e-commerce>`
Get products from an e-commerce website.

:ref:`Article <article>`
Get articles from an article website.
147 changes: 147 additions & 0 deletions tests/pages/test_article_navigation_heuristics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import pytest
from pytest_twisted import ensureDeferred
from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl
from zyte_common_items import ArticleNavigation, ProbabilityRequest

from zyte_spider_templates.pages.article_navigation_heuristics import (
HeuristicsArticleNavigationPage,
)


@ensureDeferred
async def test_unknown_article_page():
    """Links not present in the extracted navigation are only picked up
    (as low-probability subcategories) when the heuristics are enabled
    via the ``full_domain`` page param."""
    html = b"""
    <html>
    <body>
        <div>
            <h1>Categories</h1>
            <div>
                <a href="https://example.com/category/news">News</a>
                <a href="https://example.com/category/sports">Sports</a>
            </div>
        </div>
        <div>
            <h1>Articles</h1>
            <div>
                <a href="https://example.com/article?id=breaking-news">Breaking News</a>
                <a href="https://example.com/article?id=latest-scores">Latest Scores</a>
            </div>
            <span>
                <a href="https://example.com/page-2">Next Page</a>
            </span>
        </div>
        <a href="https://example.com/category/probably-relevant">Probably Relevant?</a>
        <footer>
            <a href="https://example.com/privacy-policy">Privacy Policy</a>
            <a href="https://another-example.com">Link to other domain</a>
        </footer>
    </body>
    </html>
    """
    resp = AnyResponse(HttpResponse("https://example.com", html))
    nav_item = ArticleNavigation.from_dict(
        {
            "url": "https://example.com",
            "subCategories": [
                {"url": "https://example.com/category/news", "name": "News"},
                {"url": "https://example.com/category/sports", "name": "Sports"},
            ],
            "items": [
                {
                    "url": "https://example.com/article?id=breaking-news",
                    "name": "Breaking News",
                },
                {
                    "url": "https://example.com/article?id=latest-scores",
                    "name": "Latest Scores",
                },
            ],
            "nextPage": {
                "url": "https://example.com/page-2",
                "name": "Next Page",
            },
            "metadata": {"dateDownloaded": "2024-01-09T14:37:58Z"},
        }
    )
    # Every URL already present in the extracted navigation item.
    expected_urls = [
        "https://example.com/category/news",
        "https://example.com/category/sports",
        "https://example.com/article?id=breaking-news",
        "https://example.com/article?id=latest-scores",
        "https://example.com/page-2",
    ]
    expected_subcategories = [
        ProbabilityRequest.from_dict(
            {"url": "https://example.com/category/news", "name": "News"}
        ),
        ProbabilityRequest.from_dict(
            {"url": "https://example.com/category/sports", "name": "Sports"}
        ),
    ]
    req_url = RequestUrl(resp.url)

    # Heuristics turned OFF ("full_domain" is absent from the page params).
    page = HeuristicsArticleNavigationPage(
        req_url, nav_item, resp, PageParams({"allow_domains": "example.com"})
    )
    result = await page.to_item()

    assert result.subCategories == expected_subcategories
    assert page._urls_for_navigation() == expected_urls

    # Heuristics turned ON: the "probably relevant" link is appended as an
    # extra low-probability subcategory; footer/off-domain links stay out.
    page = HeuristicsArticleNavigationPage(
        req_url, nav_item, resp, PageParams({"full_domain": "example.com"})
    )
    result = await page.to_item()

    heuristic_request = ProbabilityRequest.from_dict(
        {
            "url": "https://example.com/category/probably-relevant",
            "name": "[heuristics] Probably Relevant?",
            "metadata": {"probability": 0.1},
        }
    )
    assert result.subCategories == expected_subcategories + [heuristic_request]
    assert page._urls_for_navigation() == expected_urls


@ensureDeferred
async def test_crawl_nofollow_links():
    """Heuristics must skip rel="nofollow" links and off-domain links."""
    html = b"""
    <html>
    <body>
        <div>
            <a href="https://outside-example.com/can-follow">Outside link</a>
            <a href="https://example.com/can-follow">Can follow</a>
            <a href="https://example.com/dont-follow" rel="nofollow">Dont follow</a>
        </div>
    </body>
    </html>
    """
    base_url = "https://example.com"
    resp = AnyResponse(HttpResponse(base_url, html))

    page = HeuristicsArticleNavigationPage(
        RequestUrl(resp.url),
        ArticleNavigation(url=base_url),
        resp,
        PageParams({"full_domain": "example.com"}),
    )

    # Only the same-domain, follow-able link survives the heuristics.
    followed = [request.url for request in page.subCategories]
    assert followed == ["https://example.com/can-follow"]


def test_deprecated_page_objects():
    """Importing from the deprecated ``page_objects`` package emits a
    DeprecationWarning while still exposing the page class."""
    with pytest.warns(DeprecationWarning, match="page_objects"):
        from zyte_spider_templates.page_objects import (  # noqa: F401
            HeuristicsArticleNavigationPage,
        )

    # We cannot test the warning again because duplicate warnings are ignored,
    # but we still want to ensure that we can import the class.
    from zyte_spider_templates.page_objects.article_navigation_heuristics import (  # noqa: F401, F811
        HeuristicsArticleNavigationPage,
    )
14 changes: 8 additions & 6 deletions tests/pages/test_product_navigation_heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ async def test_unknown_product_page():
item = await page.to_item()

assert item.subCategories == urls_subcategories
assert page._urls_for_category() == all_valid_urls
assert page._urls_for_navigation() == all_valid_urls

# Heuristics turned ON
page_params = PageParams({"full_domain": "example.com"})
Expand All @@ -100,7 +100,7 @@ async def test_unknown_product_page():
}
)
]
assert page._urls_for_category() == all_valid_urls
assert page._urls_for_navigation() == all_valid_urls


@ensureDeferred
Expand Down Expand Up @@ -129,10 +129,12 @@ async def test_crawl_nofollow_links():


def test_deprecated_page_objects():
with pytest.warns(DeprecationWarning, match="page_objects"):
from zyte_spider_templates.page_objects import ( # noqa: F401
HeuristicsProductNavigationPage,
)

# We cannot test this warning as it will be ignored after the test run for articles
# with pytest.warns(DeprecationWarning, match="page_objects"):
# from zyte_spider_templates.page_objects import ( # noqa: F401
# HeuristicsProductNavigationPage,
# )

# We cannot test the warning again because duplicate warnings are ignored,
# but we still want to ensure that we can import the class.
Expand Down
Loading