From 0f7a583db228b374bd164d6371e882fbc46ba5b9 Mon Sep 17 00:00:00 2001 From: oltarasenko Date: Thu, 29 Aug 2024 16:05:06 +0200 Subject: [PATCH] Simplify Crawly.fetch/2 function (#306) I decided to refactor this code a bit, so it is more simple (and also credo was complaining about it). Hopefully Ziink will forgive me :) --- .../quickstart/lib/quickstart/books_spider.ex | 2 +- examples/quickstart/mix.lock | 18 ++- lib/crawly.ex | 149 +++++++----------- test/crawly_test.exs | 86 ++++++---- 4 files changed, 130 insertions(+), 125 deletions(-) diff --git a/examples/quickstart/lib/quickstart/books_spider.ex b/examples/quickstart/lib/quickstart/books_spider.ex index c95e95b5..d5a2eb0a 100644 --- a/examples/quickstart/lib/quickstart/books_spider.ex +++ b/examples/quickstart/lib/quickstart/books_spider.ex @@ -7,7 +7,7 @@ defmodule BooksToScrape do end @impl Crawly.Spider - def base_url(), do: "" + def base_url(), do: "https://books.toscrape.com/" @impl Crawly.Spider def parse_item(response) do diff --git a/examples/quickstart/mix.lock b/examples/quickstart/mix.lock index e04875b3..3ba7919d 100644 --- a/examples/quickstart/mix.lock +++ b/examples/quickstart/mix.lock @@ -1,25 +1,29 @@ %{ - "certifi": {:hex, :certifi, "2.9.0", "6f2a475689dd47f19fb74334859d460a2dc4e3252a3324bd2111b8f0429e7e21", [:rebar3], [], "hexpm", "266da46bdb06d6c6d35fde799bcb28d36d985d424ad7c08b5bb48f5b5cdd4641"}, + "certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"}, "cowboy": {:hex, :cowboy, "2.9.0", "865dd8b6607e14cf03282e10e934023a1bd8be6f6bacf921a7e2a96d800cd452", [:make, :rebar3], [{:cowlib, "2.11.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "2c729f934b4e1aa149aff882f57c6372c15399a20d54f65c8d67bef583021bde"}, "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"}, "cowlib": {:hex, :cowlib, "2.11.0", "0b9ff9c346629256c42ebe1eeb769a83c6cb771a6ee5960bd110ab0b9b872063", [:make, :rebar3], [], "hexpm", "2b3e9da0b21c4565751a6d4901c20d1b4cc25cbb7fd50d91d2ab6dd287bc86a9"}, + "decimal": {:hex, :decimal, "2.1.1", "5611dca5d4b2c3dd497dec8f68751f1f1a54755e8ed2a966c2633cf885973ad6", [:mix], [], "hexpm", "53cfe5f497ed0e7771ae1a475575603d77425099ba5faef9394932b35020ffcc"}, "elixir_uuid": {:hex, :elixir_uuid, "1.2.1", "dce506597acb7e6b0daeaff52ff6a9043f5919a4c3315abb4143f0b00378c097", [:mix], [], "hexpm", "f7eba2ea6c3555cea09706492716b0d87397b88946e6380898c2889d68585752"}, + "ex_json_schema": {:hex, :ex_json_schema, "0.9.3", "fc17c50d410fd99fa6e814e1aed60122d8ff2578b869d17a9db1ce1c621382b6", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "b79962d09cefd33001706255187bdb483a0c2b4442d5edc6822eb7574a8df0a8"}, "floki": {:hex, :floki, "0.33.1", "f20f1eb471e726342b45ccb68edb9486729e7df94da403936ea94a794f072781", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "461035fd125f13fdf30f243c85a0b1e50afbec876cbf1ceefe6fddd2e6d712c6"}, - "gollum": {:hex, :new_gollum, "0.4.0", "89e3e2fc5abd032455341c4a03bcef7042b8d08e02c51df24b99a1a0a1ad69b1", [:mix], [{:httpoison, "~> 1.7", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "85c68465e8678637638656945677062a4e7086e91a04d5c4bca1027321c74582"}, - "hackney": {:hex, :hackney, "1.18.1", "f48bf88f521f2a229fc7bae88cf4f85adc9cd9bcf23b5dc8eb6a1788c662c4f6", [:rebar3], [{:certifi, "~>2.9.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a4ecdaff44297e9b5894ae499e9a070ea1888c84afdd1fd9b7b2bc384950128e"}, + "gollum": {:hex, :new_gollum, "0.5.0", "871dd0ee15c65b38932da5b6eac1413c2be96545d6cf5d6419081ce85a9a883a", [:mix], [{:httpoison, "~> 2.2", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "81722a31ef162270432fbfc3dbf1f57d08530a9e572a57bc528748942d020f84"}, + "hackney": {:hex, :hackney, "1.20.1", "8d97aec62ddddd757d128bfd1df6c5861093419f8f7a4223823537bad5d064e2", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "fe9094e5f1a2a2c0a7d10918fee36bfec0ec2a979994cff8cfe8058cd9af38e3"}, "html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"}, - "httpoison": {:hex, :httpoison, "1.8.2", "9eb9c63ae289296a544842ef816a85d881d4a31f518a0fec089aaa744beae290", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "2bb350d26972e30c96e2ca74a1aaf8293d61d0742ff17f01e0279fef11599921"}, + "httpoison": {:hex, :httpoison, "2.2.1", "87b7ed6d95db0389f7df02779644171d7319d319178f6680438167d7b69b1f3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "51364e6d2f429d80e14fe4b5f8e39719cacd03eb3f9a9286e61e216feac2d2df"}, "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, "mime": {:hex, :mime, "2.0.3", "3676436d3d1f7b81b5a2d2bd8405f412c677558c81b1c92be58c00562bb59095", [:mix], [], "hexpm", "27a30bf0db44d25eecba73755acf4068cbfe26a4372f9eb3e4ea3a45956bff6b"}, - "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, - "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, + "mimerl": {:hex, :mimerl, "1.3.0", "d0cd9fc04b9061f82490f6581e0128379830e78535e017f7780f37fea7545726", [:rebar3], [], "hexpm", "a1e15a50d1887217de95f0b9b0793e32853f7c258a5cd227650889b38839fe9d"}, + "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"}, "plug": {:hex, :plug, "1.13.6", "187beb6b67c6cec50503e940f0434ea4692b19384d47e5fdfd701e93cadb4cc2", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "02b9c6b9955bce92c829f31d6284bf53c591ca63c4fb9ff81dfd0418667a34ff"}, "plug_cowboy": {:hex, :plug_cowboy, "2.5.2", "62894ccd601cf9597e2c23911ff12798a8a18d237e9739f58a6b04e4988899fe", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "ea6e87f774c8608d60c8d34022a7d073bd7680a0a013f049fc62bf35efea1044"}, "plug_crypto": {:hex, :plug_crypto, "1.2.3", "8f77d13aeb32bfd9e654cb68f0af517b371fb34c56c9f2b58fe3df1235c1251a", [:mix], [], "hexpm", "b5672099c6ad5c202c45f5a403f21a3411247f164e4a8fab056e5cd8a290f4a2"}, "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"}, "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"}, - "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"}, + "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, "telemetry": {:hex, :telemetry, "1.1.0", "a589817034a27eab11144ad24d5c0f9fab1f58173274b1e9bae7074af9cbee51", [:rebar3], [], "hexpm", "b727b2a1f75614774cff2d7565b64d0dfa5bd52ba517f16543e6fc7efcc0df48"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, + "yamerl": {:hex, :yamerl, "0.10.0", "4ff81fee2f1f6a46f1700c0d880b24d193ddb74bd14ef42cb0bcf46e81ef2f8e", [:rebar3], [], "hexpm", "346adb2963f1051dc837a2364e4acf6eb7d80097c0f53cbdc3046ec8ec4b4e6e"}, + "yaml_elixir": {:hex, :yaml_elixir, "2.11.0", "9e9ccd134e861c66b84825a3542a1c22ba33f338d82c07282f4f1f52d847bd50", [:mix], [{:yamerl, "~> 0.10", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", "53cc28357ee7eb952344995787f4bb8cc3cecbf189652236e9b163e8ce1bc242"}, } diff --git a/lib/crawly.ex b/lib/crawly.ex index c1662ada..9e07a01d 100644 --- a/lib/crawly.ex +++ b/lib/crawly.ex @@ -6,102 +6,75 @@ defmodule Crawly do require Logger @doc """ - Fetches a given url. This function is mainly used for the spiders development - when you need to get individual pages and parse them. + Fetches the content from a given URL using the specified options. - The fetched URL is being converted to a request, and the request is piped - through the middlewares specified in a config (with the exception of - `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`) + ## Parameters - Provide a spider with the `:with` option to fetch a given webpage using that spider. + - `url`: The URL to fetch the content from. It should be a valid string. + - `opts`: A keyword list of options to customize the request. The supported options are: + - `:headers` (optional): A list of HTTP headers to include in the request. Defaults to an empty list `[]`. + - `:request_opts` (optional): A list of options to pass to the HTTP client for configuring the request. Defaults to an empty list `[]`. + - `:fetcher` (optional): The module responsible for performing the HTTP request. This module must implement a `fetch/2` function. Defaults to `Crawly.Fetchers.HTTPoisonFetcher`. - ### Fetching with a spider - To fetch a response from a url with a spider, define your spider, and pass the module name to the `:with` option. + ## Returns - iex> Crawly.fetch("https://www.example.com", with: MySpider) - {%HTTPoison.Response{...}, %{...}, [...], %{...}} + - `{:ok, %HTTPoison.Response{}}`: On successful fetch, returns a tuple containing `:ok` and the HTTP response. + - `{:error, %HTTPoison.Error{}}`: On failure, returns a tuple containing `:error` and the error details. - Using the `:with` option will return a 4 item tuple: + ## Examples - 1. The HTTPoison response - 2. The result returned from the `parse_item/1` callback - 3. The list of items that have been processed by the declared item pipelines. - 4. The pipeline state, included for debugging purposes. + Fetch a URL with default options: + + iex> fetch("https://example.com") + {:ok, %HTTPoison.Response{status_code: 200, body: "...", ...}} + + Fetch a URL with custom headers: + + iex> fetch("https://example.com", headers: [{"User-Agent", "MyCrawler"}]) + {:ok, %HTTPoison.Response{status_code: 200, body: "...", ...}} + + Handle a fetch error: + + iex> fetch("https://invalid-url.com") + {:error, %HTTPoison.Error{id: nil, reason: :nxdomain}} + + ## Notes + + - The `fetcher` option allows you to customize how the HTTP request is performed. By default, the `Crawly.Fetchers.HTTPoisonFetcher` module is used, which relies on `HTTPoison` to perform the request. + - The `request_opts` parameter allows you to customize the behavior of the HTTP client, such as timeouts, SSL options, etc. + - The function returns either `{:ok, response}` for successful requests or `{:error, error}` for failed requests, allowing you to handle these cases explicitly in your code. """ - @type with_opt :: {:with, nil | module()} - @type request_opt :: {:request_options, list(Crawly.Request.option())} - @type headers_opt :: {:headers, list(Crawly.Request.header())} - - @type parsed_item_result :: Crawly.ParsedItem.t() - @type parsed_items :: list(any()) - @type pipeline_state :: %{optional(atom()) => any()} - @type spider :: module() - - @spec fetch(url, opts) :: - HTTPoison.Response.t() - | {HTTPoison.Response.t(), parsed_item_result, parsed_items, - pipeline_state} - when url: binary(), - opts: [ - with_opt - | request_opt - | headers_opt - ] + @spec fetch(url :: String.t(), options :: list()) :: + {:ok, HTTPoison.Response.t()} | {:error, HTTPoison.Error.t()} def fetch(url, opts \\ []) do - opts = Enum.into(opts, %{with: nil, request_options: [], headers: []}) - - request0 = - Crawly.Request.new(url, opts[:headers], opts[:request_options]) - |> Map.put( - :middlewares, - Crawly.Utils.get_settings(:middlewares, opts[:with], []) - ) - - ignored_middlewares = [ - Crawly.Middlewares.DomainFilter, - Crawly.Middlewares.RobotsTxt - ] - - new_middlewares = request0.middlewares -- ignored_middlewares - - request0 = - Map.put( - request0, - :middlewares, - new_middlewares - ) - - {%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{}) - {:ok, {response, _}} = Crawly.Worker.get_response({request, opts[:with]}) - - case opts[:with] do - nil -> - # no spider provided, return response as is - response - - _ -> - # spider provided, send response through parse_item callback, pipe through the pipelines - with {:ok, {parsed_result, _, _}} <- - Crawly.Worker.parse_item({response, opts[:with]}), - pipelines <- - Crawly.Utils.get_settings( - :pipelines, - opts[:with] - ), - items <- Map.get(parsed_result, :items, []), - {pipeline_result, pipeline_state} <- - Enum.reduce(items, {[], %{}}, fn item, {acc, state} -> - {piped, state} = Crawly.Utils.pipe(pipelines, item, state) - - if piped == false do - # dropped - {acc, state} - else - {[piped | acc], state} - end - end) do - {response, parsed_result, pipeline_result, pipeline_state} - end + headers = Keyword.get(opts, :headers, []) + request_opts = Keyword.get(opts, :request_opts, []) + fetcher = Keyword.get(opts, :fetcher, Crawly.Fetchers.HTTPoisonFetcher) + request = Crawly.Request.new(url, headers, request_opts) + fetcher.fetch(request, request_opts) + end + + @doc """ + Fetches content from the given URL and processes it with the specified spider. + + ## Parameters + + - `url`: The URL to fetch the content from. It should be a valid string. + - `spider_name`: The spider module responsible for processing the fetched response. The module must implement a `parse_item/1` function. + - `options`: A keyword list of options to customize the request. The options are passed directly to the `fetch/2` function. + + Returned Crawly.ParsedItem or HTTPoison error + """ + @spec fetch_with_spider( + url :: String.t(), + spider_name :: module(), + options :: list() + ) :: + Crawly.ParsedItem.t() | {:error, HTTPoison.Error.t()} + def fetch_with_spider(url, spider_name, options \\ []) do + case fetch(url, options) do + {:ok, response} -> spider_name.parse_item(response) + {:error, _reason} = err -> err end end diff --git a/test/crawly_test.exs b/test/crawly_test.exs index 65a68274..d310970d 100644 --- a/test/crawly_test.exs +++ b/test/crawly_test.exs @@ -1,44 +1,72 @@ defmodule CrawlyTest do use ExUnit.Case - setup do - :meck.new(CrawlyTestSpider, [:non_strict]) + describe "fetch/1" do + test "can fetch a given url" do + :meck.expect(HTTPoison, :get, fn _, _, _ -> + {:ok, %HTTPoison.Response{}} + end) - :meck.expect(CrawlyTestSpider, :parse_item, fn _resp -> - %{ - items: [%{content: "hello"}], - requests: [ - Crawly.Utils.request_from_url("https://www.example.com/test") - ] - } - end) + assert {:ok, %HTTPoison.Response{}} = Crawly.fetch("https://example.com") + end - :meck.expect(CrawlyTestSpider, :override_settings, fn -> - [pipelines: [Crawly.Pipelines.JSONEncoder]] - end) + test "returns error if unable to fetch the page" do + :meck.expect(HTTPoison, :get, fn _, _, _ -> + {:error, %HTTPoison.Error{}} + end) - :meck.expect(HTTPoison, :get, fn _, _, _ -> {:ok, %HTTPoison.Response{}} end) + assert {:error, %HTTPoison.Error{}} = Crawly.fetch("invalid-url") + end - on_exit(fn -> - :meck.unload() - end) + test "can fetch a given url with custom request options" do + request_opts = [timeout: 5000, recv_timeout: 5000] - {:ok, spider_module: CrawlyTestSpider} - end + :meck.expect(HTTPoison, :get, fn _, _, passed_request_opts -> + assert passed_request_opts == request_opts + {:ok, %HTTPoison.Response{}} + end) + + assert {:ok, %HTTPoison.Response{}} = + Crawly.fetch("https://example.com", request_opts: request_opts) + end + + test "can fetch a given url with headers" do + headers = [{"Authorization", "Bearer token"}] + + :meck.expect(HTTPoison, :get, fn _, headers_opts, _ -> + assert headers == headers_opts + {:ok, %HTTPoison.Response{}} + end) - test "fetch/1 is able to fetch a given url using global config, returns a response" do - assert %HTTPoison.Response{} = Crawly.fetch("https://example.com") + assert {:ok, %HTTPoison.Response{}} = + Crawly.fetch("https://example.com", headers: headers) + end end - test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems", - %{spider_module: spider_module} do - assert {%HTTPoison.Response{}, parsed_item_res, parsed_items, - _pipeline_state} = - Crawly.fetch("http://example.com", with: spider_module) + describe "fetch_with_spider/3" do + test "Can fetch a given url from behalf of the spider" do + expected_new_requests = [ + Crawly.Utils.request_from_url("https://www.example.com") + ] + + :meck.expect(HTTPoison, :get, fn _, _, _ -> + {:ok, %HTTPoison.Response{}} + end) + + :meck.new(CrawlyTestSpider, [:non_strict]) + + :meck.expect(CrawlyTestSpider, :parse_item, fn _resp -> + %{ + items: [%{content: "hello"}], + requests: expected_new_requests + } + end) - assert %{items: [_], requests: _requests} = parsed_item_res + %{requests: requests, items: items} = + Crawly.fetch_with_spider("https://example.com", CrawlyTestSpider) - assert [encoded] = parsed_items - assert encoded =~ "hello" + assert items == [%{content: "hello"}] + assert requests == expected_new_requests + end end end