Simplify Crawly.fetch/2 function (#306)
I decided to refactor this code a bit so it is simpler (and also because
Credo was complaining about it). Hopefully Ziink will forgive me :)
oltarasenko authored Aug 29, 2024
1 parent d62f5e7 commit 0f7a583
Showing 4 changed files with 130 additions and 125 deletions.
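In short, the commit changes the public return shape of `Crawly.fetch/2` and moves the spider-assisted flow into a new `Crawly.fetch_with_spider/3`. A rough before/after sketch of the caller-facing difference (the `BooksToScrape` spider is the quickstart example touched in this commit; exact return values depend on the configured middlewares and pipelines):

```elixir
# Before this commit: fetch/2 returned a bare response, or a 4-element tuple
# when a spider module was passed via the :with option.
response = Crawly.fetch("https://books.toscrape.com/")

{response, parsed_item_result, parsed_items, _pipeline_state} =
  Crawly.fetch("https://books.toscrape.com/", with: BooksToScrape)

# After this commit: fetch/2 returns an :ok/:error tuple, and the spider-assisted
# flow lives in the new fetch_with_spider/3, which returns the spider's ParsedItem.
{:ok, %HTTPoison.Response{} = response} = Crawly.fetch("https://books.toscrape.com/")

%{items: items, requests: requests} =
  Crawly.fetch_with_spider("https://books.toscrape.com/", BooksToScrape)
```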
2 changes: 1 addition & 1 deletion examples/quickstart/lib/quickstart/books_spider.ex
@@ -7,7 +7,7 @@ defmodule BooksToScrape do
end

@impl Crawly.Spider
def base_url(), do: ""
def base_url(), do: "https://books.toscrape.com/"

@impl Crawly.Spider
def parse_item(response) do
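For context on the one-line quickstart fix above: `base_url/0` is the spider callback Crawly uses to scope requests to the target site (for example in `Crawly.Middlewares.DomainFilter`, mentioned later in this diff), so returning the real site URL instead of `""` matters. A minimal sketch of the callback in a spider; everything except the `base_url/0` line is assumed for illustration and is not part of this diff:

```elixir
defmodule BooksToScrape do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init() do
    # Hypothetical start URL for illustration; the quickstart defines its own.
    [start_urls: ["https://books.toscrape.com/"]]
  end

  @impl Crawly.Spider
  def parse_item(_response) do
    # Parsing logic is truncated in this diff; see the quickstart example.
    %{items: [], requests: []}
  end
end
```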
18 changes: 11 additions & 7 deletions examples/quickstart/mix.lock
@@ -1,25 +1,29 @@
%{
"certifi": {:hex, :certifi, "2.9.0", "6f2a475689dd47f19fb74334859d460a2dc4e3252a3324bd2111b8f0429e7e21", [:rebar3], [], "hexpm", "266da46bdb06d6c6d35fde799bcb28d36d985d424ad7c08b5bb48f5b5cdd4641"},
"certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"},
"cowboy": {:hex, :cowboy, "2.9.0", "865dd8b6607e14cf03282e10e934023a1bd8be6f6bacf921a7e2a96d800cd452", [:make, :rebar3], [{:cowlib, "2.11.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "2c729f934b4e1aa149aff882f57c6372c15399a20d54f65c8d67bef583021bde"},
"cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"},
"cowlib": {:hex, :cowlib, "2.11.0", "0b9ff9c346629256c42ebe1eeb769a83c6cb771a6ee5960bd110ab0b9b872063", [:make, :rebar3], [], "hexpm", "2b3e9da0b21c4565751a6d4901c20d1b4cc25cbb7fd50d91d2ab6dd287bc86a9"},
"decimal": {:hex, :decimal, "2.1.1", "5611dca5d4b2c3dd497dec8f68751f1f1a54755e8ed2a966c2633cf885973ad6", [:mix], [], "hexpm", "53cfe5f497ed0e7771ae1a475575603d77425099ba5faef9394932b35020ffcc"},
"elixir_uuid": {:hex, :elixir_uuid, "1.2.1", "dce506597acb7e6b0daeaff52ff6a9043f5919a4c3315abb4143f0b00378c097", [:mix], [], "hexpm", "f7eba2ea6c3555cea09706492716b0d87397b88946e6380898c2889d68585752"},
"ex_json_schema": {:hex, :ex_json_schema, "0.9.3", "fc17c50d410fd99fa6e814e1aed60122d8ff2578b869d17a9db1ce1c621382b6", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "b79962d09cefd33001706255187bdb483a0c2b4442d5edc6822eb7574a8df0a8"},
"floki": {:hex, :floki, "0.33.1", "f20f1eb471e726342b45ccb68edb9486729e7df94da403936ea94a794f072781", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "461035fd125f13fdf30f243c85a0b1e50afbec876cbf1ceefe6fddd2e6d712c6"},
"gollum": {:hex, :new_gollum, "0.4.0", "89e3e2fc5abd032455341c4a03bcef7042b8d08e02c51df24b99a1a0a1ad69b1", [:mix], [{:httpoison, "~> 1.7", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "85c68465e8678637638656945677062a4e7086e91a04d5c4bca1027321c74582"},
"hackney": {:hex, :hackney, "1.18.1", "f48bf88f521f2a229fc7bae88cf4f85adc9cd9bcf23b5dc8eb6a1788c662c4f6", [:rebar3], [{:certifi, "~>2.9.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a4ecdaff44297e9b5894ae499e9a070ea1888c84afdd1fd9b7b2bc384950128e"},
"gollum": {:hex, :new_gollum, "0.5.0", "871dd0ee15c65b38932da5b6eac1413c2be96545d6cf5d6419081ce85a9a883a", [:mix], [{:httpoison, "~> 2.2", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "81722a31ef162270432fbfc3dbf1f57d08530a9e572a57bc528748942d020f84"},
"hackney": {:hex, :hackney, "1.20.1", "8d97aec62ddddd757d128bfd1df6c5861093419f8f7a4223823537bad5d064e2", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "fe9094e5f1a2a2c0a7d10918fee36bfec0ec2a979994cff8cfe8058cd9af38e3"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"httpoison": {:hex, :httpoison, "1.8.2", "9eb9c63ae289296a544842ef816a85d881d4a31f518a0fec089aaa744beae290", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "2bb350d26972e30c96e2ca74a1aaf8293d61d0742ff17f01e0279fef11599921"},
"httpoison": {:hex, :httpoison, "2.2.1", "87b7ed6d95db0389f7df02779644171d7319d319178f6680438167d7b69b1f3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "51364e6d2f429d80e14fe4b5f8e39719cacd03eb3f9a9286e61e216feac2d2df"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "2.0.3", "3676436d3d1f7b81b5a2d2bd8405f412c677558c81b1c92be58c00562bb59095", [:mix], [], "hexpm", "27a30bf0db44d25eecba73755acf4068cbfe26a4372f9eb3e4ea3a45956bff6b"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
"mimerl": {:hex, :mimerl, "1.3.0", "d0cd9fc04b9061f82490f6581e0128379830e78535e017f7780f37fea7545726", [:rebar3], [], "hexpm", "a1e15a50d1887217de95f0b9b0793e32853f7c258a5cd227650889b38839fe9d"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
"plug": {:hex, :plug, "1.13.6", "187beb6b67c6cec50503e940f0434ea4692b19384d47e5fdfd701e93cadb4cc2", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "02b9c6b9955bce92c829f31d6284bf53c591ca63c4fb9ff81dfd0418667a34ff"},
"plug_cowboy": {:hex, :plug_cowboy, "2.5.2", "62894ccd601cf9597e2c23911ff12798a8a18d237e9739f58a6b04e4988899fe", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "ea6e87f774c8608d60c8d34022a7d073bd7680a0a013f049fc62bf35efea1044"},
"plug_crypto": {:hex, :plug_crypto, "1.2.3", "8f77d13aeb32bfd9e654cb68f0af517b371fb34c56c9f2b58fe3df1235c1251a", [:mix], [], "hexpm", "b5672099c6ad5c202c45f5a403f21a3411247f164e4a8fab056e5cd8a290f4a2"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"},
"ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
"telemetry": {:hex, :telemetry, "1.1.0", "a589817034a27eab11144ad24d5c0f9fab1f58173274b1e9bae7074af9cbee51", [:rebar3], [], "hexpm", "b727b2a1f75614774cff2d7565b64d0dfa5bd52ba517f16543e6fc7efcc0df48"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
"yamerl": {:hex, :yamerl, "0.10.0", "4ff81fee2f1f6a46f1700c0d880b24d193ddb74bd14ef42cb0bcf46e81ef2f8e", [:rebar3], [], "hexpm", "346adb2963f1051dc837a2364e4acf6eb7d80097c0f53cbdc3046ec8ec4b4e6e"},
"yaml_elixir": {:hex, :yaml_elixir, "2.11.0", "9e9ccd134e861c66b84825a3542a1c22ba33f338d82c07282f4f1f52d847bd50", [:mix], [{:yamerl, "~> 0.10", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", "53cc28357ee7eb952344995787f4bb8cc3cecbf189652236e9b163e8ce1bc242"},
}
149 changes: 61 additions & 88 deletions lib/crawly.ex
@@ -6,102 +6,75 @@ defmodule Crawly do
require Logger

@doc """
Fetches a given url. This function is mainly used for the spiders development
when you need to get individual pages and parse them.
Fetches the content from a given URL using the specified options.
The fetched URL is being converted to a request, and the request is piped
through the middlewares specified in a config (with the exception of
`Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`)
## Parameters
Provide a spider with the `:with` option to fetch a given webpage using that spider.
- `url`: The URL to fetch the content from. It should be a valid string.
- `opts`: A keyword list of options to customize the request. The supported options are:
- `:headers` (optional): A list of HTTP headers to include in the request. Defaults to an empty list `[]`.
- `:request_opts` (optional): A list of options to pass to the HTTP client for configuring the request. Defaults to an empty list `[]`.
- `:fetcher` (optional): The module responsible for performing the HTTP request. This module must implement a `fetch/2` function. Defaults to `Crawly.Fetchers.HTTPoisonFetcher`.
### Fetching with a spider
To fetch a response from a url with a spider, define your spider, and pass the module name to the `:with` option.
## Returns
iex> Crawly.fetch("https://www.example.com", with: MySpider)
{%HTTPoison.Response{...}, %{...}, [...], %{...}}
- `{:ok, %HTTPoison.Response{}}`: On successful fetch, returns a tuple containing `:ok` and the HTTP response.
- `{:error, %HTTPoison.Error{}}`: On failure, returns a tuple containing `:error` and the error details.
Using the `:with` option will return a 4 item tuple:
## Examples
1. The HTTPoison response
2. The result returned from the `parse_item/1` callback
3. The list of items that have been processed by the declared item pipelines.
4. The pipeline state, included for debugging purposes.
Fetch a URL with default options:
iex> fetch("https://example.com")
{:ok, %HTTPoison.Response{status_code: 200, body: "...", ...}}
Fetch a URL with custom headers:
iex> fetch("https://example.com", headers: [{"User-Agent", "MyCrawler"}])
{:ok, %HTTPoison.Response{status_code: 200, body: "...", ...}}
Handle a fetch error:
iex> fetch("https://invalid-url.com")
{:error, %HTTPoison.Error{id: nil, reason: :nxdomain}}
## Notes
- The `fetcher` option allows you to customize how the HTTP request is performed. By default, the `Crawly.Fetchers.HTTPoisonFetcher` module is used, which relies on `HTTPoison` to perform the request.
- The `request_opts` parameter allows you to customize the behavior of the HTTP client, such as timeouts, SSL options, etc.
- The function returns either `{:ok, response}` for successful requests or `{:error, error}` for failed requests, allowing you to handle these cases explicitly in your code.
"""
@type with_opt :: {:with, nil | module()}
@type request_opt :: {:request_options, list(Crawly.Request.option())}
@type headers_opt :: {:headers, list(Crawly.Request.header())}

@type parsed_item_result :: Crawly.ParsedItem.t()
@type parsed_items :: list(any())
@type pipeline_state :: %{optional(atom()) => any()}
@type spider :: module()

@spec fetch(url, opts) ::
HTTPoison.Response.t()
| {HTTPoison.Response.t(), parsed_item_result, parsed_items,
pipeline_state}
when url: binary(),
opts: [
with_opt
| request_opt
| headers_opt
]
@spec fetch(url :: String.t(), options :: list()) ::
{:ok, HTTPoison.Response.t()} | {:error, HTTPoison.Error.t()}
def fetch(url, opts \\ []) do
opts = Enum.into(opts, %{with: nil, request_options: [], headers: []})

request0 =
Crawly.Request.new(url, opts[:headers], opts[:request_options])
|> Map.put(
:middlewares,
Crawly.Utils.get_settings(:middlewares, opts[:with], [])
)

ignored_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.RobotsTxt
]

new_middlewares = request0.middlewares -- ignored_middlewares

request0 =
Map.put(
request0,
:middlewares,
new_middlewares
)

{%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{})
{:ok, {response, _}} = Crawly.Worker.get_response({request, opts[:with]})

case opts[:with] do
nil ->
# no spider provided, return response as is
response

_ ->
# spider provided, send response through parse_item callback, pipe through the pipelines
with {:ok, {parsed_result, _, _}} <-
Crawly.Worker.parse_item({response, opts[:with]}),
pipelines <-
Crawly.Utils.get_settings(
:pipelines,
opts[:with]
),
items <- Map.get(parsed_result, :items, []),
{pipeline_result, pipeline_state} <-
Enum.reduce(items, {[], %{}}, fn item, {acc, state} ->
{piped, state} = Crawly.Utils.pipe(pipelines, item, state)

if piped == false do
# dropped
{acc, state}
else
{[piped | acc], state}
end
end) do
{response, parsed_result, pipeline_result, pipeline_state}
end
headers = Keyword.get(opts, :headers, [])
request_opts = Keyword.get(opts, :request_opts, [])
fetcher = Keyword.get(opts, :fetcher, Crawly.Fetchers.HTTPoisonFetcher)
request = Crawly.Request.new(url, headers, request_opts)
fetcher.fetch(request, request_opts)
end
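The simplified `fetch/2` above delegates the actual HTTP call to whatever module is passed as `:fetcher`, falling back to `Crawly.Fetchers.HTTPoisonFetcher`, which makes it easy to plug in a stub for tests or offline work. A minimal sketch, assuming a hypothetical `MyApp.StubFetcher` module; the only contract required by the code above is a `fetch/2` function taking the built request and the request options:

```elixir
defmodule MyApp.StubFetcher do
  # Satisfies the fetch/2 contract used by Crawly.fetch/2 above: it receives the
  # built %Crawly.Request{} plus the :request_opts and returns an HTTPoison-style tuple.
  def fetch(%Crawly.Request{url: url}, _request_opts) do
    {:ok, %HTTPoison.Response{status_code: 200, body: "stub body for #{url}", request_url: url}}
  end
end

# Usage:
{:ok, response} = Crawly.fetch("https://example.com", fetcher: MyApp.StubFetcher)
```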

@doc """
Fetches content from the given URL and processes it with the specified spider.
## Parameters
- `url`: The URL to fetch the content from. It should be a valid string.
- `spider_name`: The spider module responsible for processing the fetched response. The module must implement a `parse_item/1` function.
- `options`: A keyword list of options to customize the request. The options are passed directly to the `fetch/2` function.
## Returns

Returns a `Crawly.ParsedItem.t()` on success, or an `{:error, HTTPoison.Error.t()}` tuple if the underlying fetch fails.
"""
@spec fetch_with_spider(
url :: String.t(),
spider_name :: module(),
options :: list()
) ::
Crawly.ParsedItem.t() | {:error, HTTPoison.Error.t()}
def fetch_with_spider(url, spider_name, options \\ []) do
case fetch(url, options) do
{:ok, response} -> spider_name.parse_item(response)
{:error, _reason} = err -> err
end
end
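A short usage sketch for `fetch_with_spider/3` as defined above, assuming a spider module like the quickstart's `BooksToScrape` whose `parse_item/1` returns a `%{items: ..., requests: ...}` map:

```elixir
case Crawly.fetch_with_spider("https://books.toscrape.com/", BooksToScrape) do
  %{items: items, requests: requests} ->
    # parse_item/1 succeeded; unlike a full crawl, the items and follow-up
    # requests are returned as-is, without running the item pipelines.
    IO.inspect(items, label: "items")
    IO.inspect(requests, label: "requests")

  {:error, %HTTPoison.Error{reason: reason}} ->
    IO.puts("fetch failed: #{inspect(reason)}")
end
```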

86 changes: 57 additions & 29 deletions test/crawly_test.exs
@@ -1,44 +1,72 @@
defmodule CrawlyTest do
  use ExUnit.Case

  setup do
    :meck.new(CrawlyTestSpider, [:non_strict])

    :meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
      %{
        items: [%{content: "hello"}],
        requests: [
          Crawly.Utils.request_from_url("https://www.example.com/test")
        ]
      }
    end)

    :meck.expect(CrawlyTestSpider, :override_settings, fn ->
      [pipelines: [Crawly.Pipelines.JSONEncoder]]
    end)

    :meck.expect(HTTPoison, :get, fn _, _, _ -> {:ok, %HTTPoison.Response{}} end)

    on_exit(fn ->
      :meck.unload()
    end)

    {:ok, spider_module: CrawlyTestSpider}
  end

  test "fetch/1 is able to fetch a given url using global config, returns a response" do
    assert %HTTPoison.Response{} = Crawly.fetch("https://example.com")
  end

  test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems",
       %{spider_module: spider_module} do
    assert {%HTTPoison.Response{}, parsed_item_res, parsed_items,
            _pipeline_state} =
             Crawly.fetch("http://example.com", with: spider_module)

    assert %{items: [_], requests: _requests} = parsed_item_res

    assert [encoded] = parsed_items
    assert encoded =~ "hello"
  end

  describe "fetch/1" do
    test "can fetch a given url" do
      :meck.expect(HTTPoison, :get, fn _, _, _ ->
        {:ok, %HTTPoison.Response{}}
      end)

      assert {:ok, %HTTPoison.Response{}} = Crawly.fetch("https://example.com")
    end

    test "returns error if unable to fetch the page" do
      :meck.expect(HTTPoison, :get, fn _, _, _ ->
        {:error, %HTTPoison.Error{}}
      end)

      assert {:error, %HTTPoison.Error{}} = Crawly.fetch("invalid-url")
    end

    test "can fetch a given url with custom request options" do
      request_opts = [timeout: 5000, recv_timeout: 5000]

      :meck.expect(HTTPoison, :get, fn _, _, passed_request_opts ->
        assert passed_request_opts == request_opts
        {:ok, %HTTPoison.Response{}}
      end)

      assert {:ok, %HTTPoison.Response{}} =
               Crawly.fetch("https://example.com", request_opts: request_opts)
    end

    test "can fetch a given url with headers" do
      headers = [{"Authorization", "Bearer token"}]

      :meck.expect(HTTPoison, :get, fn _, headers_opts, _ ->
        assert headers == headers_opts
        {:ok, %HTTPoison.Response{}}
      end)

      assert {:ok, %HTTPoison.Response{}} =
               Crawly.fetch("https://example.com", headers: headers)
    end
  end

  describe "fetch_with_spider/3" do
    test "can fetch a given url on behalf of the spider" do
      expected_new_requests = [
        Crawly.Utils.request_from_url("https://www.example.com")
      ]

      :meck.expect(HTTPoison, :get, fn _, _, _ ->
        {:ok, %HTTPoison.Response{}}
      end)

      :meck.new(CrawlyTestSpider, [:non_strict])

      :meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
        %{
          items: [%{content: "hello"}],
          requests: expected_new_requests
        }
      end)

      %{requests: requests, items: items} =
        Crawly.fetch_with_spider("https://example.com", CrawlyTestSpider)

      assert items == [%{content: "hello"}]
      assert requests == expected_new_requests
    end
  end
end
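The tests above stub `HTTPoison.get/3` with `:meck` so no real HTTP requests are made. For readers unfamiliar with the pattern, a condensed sketch of the mechanism (assuming `:meck` is available as a test dependency, as it is in this test suite):

```elixir
# :meck replaces a module's functions at runtime; Crawly's default fetcher calls
# HTTPoison.get/3 underneath, so stubbing it keeps Crawly.fetch/2 fully offline.
:meck.expect(HTTPoison, :get, fn _url, _headers, _options ->
  {:ok, %HTTPoison.Response{status_code: 200, body: "<html></html>"}}
end)

{:ok, %HTTPoison.Response{status_code: 200}} = Crawly.fetch("https://example.com")
:meck.unload(HTTPoison)
```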
