Simplify Crawly.fetch/2 function #306

Merged
merged 1 commit on Aug 29, 2024
2 changes: 1 addition & 1 deletion examples/quickstart/lib/quickstart/books_spider.ex
@@ -7,7 +7,7 @@ defmodule BooksToScrape do
end

@impl Crawly.Spider
def base_url(), do: ""
def base_url(), do: "https://books.toscrape.com/"

@impl Crawly.Spider
def parse_item(response) do
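For context, here is a condensed sketch of how the corrected `base_url/0` fits into the quickstart spider. The `init/0` and `parse_item/1` bodies are illustrative assumptions, not the exact quickstart code:

```elixir
defmodule BooksToScrape do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://books.toscrape.com/"

  @impl Crawly.Spider
  def init(), do: [start_urls: ["https://books.toscrape.com/"]]

  @impl Crawly.Spider
  def parse_item(response) do
    # Illustrative extraction: collect book titles with Floki
    {:ok, document} = Floki.parse_document(response.body)

    items =
      document
      |> Floki.find("article.product_pod h3 a")
      |> Enum.map(fn link ->
        %{title: link |> Floki.attribute("title") |> List.first()}
      end)

    # Plain map matching the shape used in this PR's tests
    %{items: items, requests: []}
  end
end
```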
18 changes: 11 additions & 7 deletions examples/quickstart/mix.lock
@@ -1,25 +1,29 @@
%{
"certifi": {:hex, :certifi, "2.9.0", "6f2a475689dd47f19fb74334859d460a2dc4e3252a3324bd2111b8f0429e7e21", [:rebar3], [], "hexpm", "266da46bdb06d6c6d35fde799bcb28d36d985d424ad7c08b5bb48f5b5cdd4641"},
"certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"},
"cowboy": {:hex, :cowboy, "2.9.0", "865dd8b6607e14cf03282e10e934023a1bd8be6f6bacf921a7e2a96d800cd452", [:make, :rebar3], [{:cowlib, "2.11.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "2c729f934b4e1aa149aff882f57c6372c15399a20d54f65c8d67bef583021bde"},
"cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"},
"cowlib": {:hex, :cowlib, "2.11.0", "0b9ff9c346629256c42ebe1eeb769a83c6cb771a6ee5960bd110ab0b9b872063", [:make, :rebar3], [], "hexpm", "2b3e9da0b21c4565751a6d4901c20d1b4cc25cbb7fd50d91d2ab6dd287bc86a9"},
"decimal": {:hex, :decimal, "2.1.1", "5611dca5d4b2c3dd497dec8f68751f1f1a54755e8ed2a966c2633cf885973ad6", [:mix], [], "hexpm", "53cfe5f497ed0e7771ae1a475575603d77425099ba5faef9394932b35020ffcc"},
"elixir_uuid": {:hex, :elixir_uuid, "1.2.1", "dce506597acb7e6b0daeaff52ff6a9043f5919a4c3315abb4143f0b00378c097", [:mix], [], "hexpm", "f7eba2ea6c3555cea09706492716b0d87397b88946e6380898c2889d68585752"},
"ex_json_schema": {:hex, :ex_json_schema, "0.9.3", "fc17c50d410fd99fa6e814e1aed60122d8ff2578b869d17a9db1ce1c621382b6", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "b79962d09cefd33001706255187bdb483a0c2b4442d5edc6822eb7574a8df0a8"},
"floki": {:hex, :floki, "0.33.1", "f20f1eb471e726342b45ccb68edb9486729e7df94da403936ea94a794f072781", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "461035fd125f13fdf30f243c85a0b1e50afbec876cbf1ceefe6fddd2e6d712c6"},
"gollum": {:hex, :new_gollum, "0.4.0", "89e3e2fc5abd032455341c4a03bcef7042b8d08e02c51df24b99a1a0a1ad69b1", [:mix], [{:httpoison, "~> 1.7", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "85c68465e8678637638656945677062a4e7086e91a04d5c4bca1027321c74582"},
"hackney": {:hex, :hackney, "1.18.1", "f48bf88f521f2a229fc7bae88cf4f85adc9cd9bcf23b5dc8eb6a1788c662c4f6", [:rebar3], [{:certifi, "~>2.9.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a4ecdaff44297e9b5894ae499e9a070ea1888c84afdd1fd9b7b2bc384950128e"},
"gollum": {:hex, :new_gollum, "0.5.0", "871dd0ee15c65b38932da5b6eac1413c2be96545d6cf5d6419081ce85a9a883a", [:mix], [{:httpoison, "~> 2.2", [hex: :httpoison, repo: "hexpm", optional: false]}], "hexpm", "81722a31ef162270432fbfc3dbf1f57d08530a9e572a57bc528748942d020f84"},
"hackney": {:hex, :hackney, "1.20.1", "8d97aec62ddddd757d128bfd1df6c5861093419f8f7a4223823537bad5d064e2", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "fe9094e5f1a2a2c0a7d10918fee36bfec0ec2a979994cff8cfe8058cd9af38e3"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"httpoison": {:hex, :httpoison, "1.8.2", "9eb9c63ae289296a544842ef816a85d881d4a31f518a0fec089aaa744beae290", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "2bb350d26972e30c96e2ca74a1aaf8293d61d0742ff17f01e0279fef11599921"},
"httpoison": {:hex, :httpoison, "2.2.1", "87b7ed6d95db0389f7df02779644171d7319d319178f6680438167d7b69b1f3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "51364e6d2f429d80e14fe4b5f8e39719cacd03eb3f9a9286e61e216feac2d2df"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "2.0.3", "3676436d3d1f7b81b5a2d2bd8405f412c677558c81b1c92be58c00562bb59095", [:mix], [], "hexpm", "27a30bf0db44d25eecba73755acf4068cbfe26a4372f9eb3e4ea3a45956bff6b"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
"mimerl": {:hex, :mimerl, "1.3.0", "d0cd9fc04b9061f82490f6581e0128379830e78535e017f7780f37fea7545726", [:rebar3], [], "hexpm", "a1e15a50d1887217de95f0b9b0793e32853f7c258a5cd227650889b38839fe9d"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
"plug": {:hex, :plug, "1.13.6", "187beb6b67c6cec50503e940f0434ea4692b19384d47e5fdfd701e93cadb4cc2", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "02b9c6b9955bce92c829f31d6284bf53c591ca63c4fb9ff81dfd0418667a34ff"},
"plug_cowboy": {:hex, :plug_cowboy, "2.5.2", "62894ccd601cf9597e2c23911ff12798a8a18d237e9739f58a6b04e4988899fe", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "ea6e87f774c8608d60c8d34022a7d073bd7680a0a013f049fc62bf35efea1044"},
"plug_crypto": {:hex, :plug_crypto, "1.2.3", "8f77d13aeb32bfd9e654cb68f0af517b371fb34c56c9f2b58fe3df1235c1251a", [:mix], [], "hexpm", "b5672099c6ad5c202c45f5a403f21a3411247f164e4a8fab056e5cd8a290f4a2"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"},
"ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
"telemetry": {:hex, :telemetry, "1.1.0", "a589817034a27eab11144ad24d5c0f9fab1f58173274b1e9bae7074af9cbee51", [:rebar3], [], "hexpm", "b727b2a1f75614774cff2d7565b64d0dfa5bd52ba517f16543e6fc7efcc0df48"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
"yamerl": {:hex, :yamerl, "0.10.0", "4ff81fee2f1f6a46f1700c0d880b24d193ddb74bd14ef42cb0bcf46e81ef2f8e", [:rebar3], [], "hexpm", "346adb2963f1051dc837a2364e4acf6eb7d80097c0f53cbdc3046ec8ec4b4e6e"},
"yaml_elixir": {:hex, :yaml_elixir, "2.11.0", "9e9ccd134e861c66b84825a3542a1c22ba33f338d82c07282f4f1f52d847bd50", [:mix], [{:yamerl, "~> 0.10", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", "53cc28357ee7eb952344995787f4bb8cc3cecbf189652236e9b163e8ce1bc242"},
}
149 changes: 61 additions & 88 deletions lib/crawly.ex
@@ -6,102 +6,75 @@ defmodule Crawly do
require Logger

@doc """
Fetches a given URL. This function is mainly used during spider development,
when you need to get individual pages and parse them.
Fetches the content from a given URL using the specified options.

The fetched URL is converted to a request, and the request is piped
through the middlewares specified in the config (with the exception of
`Crawly.Middlewares.DomainFilter` and `Crawly.Middlewares.RobotsTxt`).
## Parameters

Provide a spider with the `:with` option to fetch a given webpage using that spider.
- `url`: The URL to fetch the content from. It should be a valid string.
- `opts`: A keyword list of options to customize the request. The supported options are:
- `:headers` (optional): A list of HTTP headers to include in the request. Defaults to an empty list `[]`.
- `:request_opts` (optional): A list of options to pass to the HTTP client for configuring the request. Defaults to an empty list `[]`.
- `:fetcher` (optional): The module responsible for performing the HTTP request. This module must implement a `fetch/2` function. Defaults to `Crawly.Fetchers.HTTPoisonFetcher`.

### Fetching with a spider
To fetch a response from a url with a spider, define your spider, and pass the module name to the `:with` option.
## Returns

iex> Crawly.fetch("https://www.example.com", with: MySpider)
{%HTTPoison.Response{...}, %{...}, [...], %{...}}
- `{:ok, %HTTPoison.Response{}}`: On successful fetch, returns a tuple containing `:ok` and the HTTP response.
- `{:error, %HTTPoison.Error{}}`: On failure, returns a tuple containing `:error` and the error details.

Using the `:with` option returns a 4-item tuple:
## Examples

1. The HTTPoison response
2. The result returned from the `parse_item/1` callback
3. The list of items that have been processed by the declared item pipelines.
4. The pipeline state, included for debugging purposes.
Fetch a URL with default options:

iex> fetch("https://example.com")
{:ok, %HTTPoison.Response{status_code: 200, body: "...", ...}}

Fetch a URL with custom headers:

iex> fetch("https://example.com", headers: [{"User-Agent", "MyCrawler"}])
{:ok, %HTTPoison.Response{status_code: 200, body: "...", ...}}

Handle a fetch error:

iex> fetch("https://invalid-url.com")
{:error, %HTTPoison.Error{id: nil, reason: :nxdomain}}

## Notes

- The `fetcher` option allows you to customize how the HTTP request is performed. By default, the `Crawly.Fetchers.HTTPoisonFetcher` module is used, which relies on `HTTPoison` to perform the request.
- The `request_opts` parameter allows you to customize the behavior of the HTTP client, such as timeouts, SSL options, etc.
- The function returns either `{:ok, response}` for successful requests or `{:error, error}` for failed requests, allowing you to handle these cases explicitly in your code.
"""
@type with_opt :: {:with, nil | module()}
@type request_opt :: {:request_options, list(Crawly.Request.option())}
@type headers_opt :: {:headers, list(Crawly.Request.header())}

@type parsed_item_result :: Crawly.ParsedItem.t()
@type parsed_items :: list(any())
@type pipeline_state :: %{optional(atom()) => any()}
@type spider :: module()

@spec fetch(url, opts) ::
HTTPoison.Response.t()
| {HTTPoison.Response.t(), parsed_item_result, parsed_items,
pipeline_state}
when url: binary(),
opts: [
with_opt
| request_opt
| headers_opt
]
@spec fetch(url :: String.t(), options :: list()) ::
{:ok, HTTPoison.Response.t()} | {:error, HTTPoison.Error.t()}
def fetch(url, opts \\ []) do
opts = Enum.into(opts, %{with: nil, request_options: [], headers: []})

request0 =
Crawly.Request.new(url, opts[:headers], opts[:request_options])
|> Map.put(
:middlewares,
Crawly.Utils.get_settings(:middlewares, opts[:with], [])
)

ignored_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.RobotsTxt
]

new_middlewares = request0.middlewares -- ignored_middlewares

request0 =
Map.put(
request0,
:middlewares,
new_middlewares
)

{%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{})
{:ok, {response, _}} = Crawly.Worker.get_response({request, opts[:with]})

case opts[:with] do
nil ->
# no spider provided, return response as is
response

_ ->
# spider provided, send response through parse_item callback, pipe through the pipelines
with {:ok, {parsed_result, _, _}} <-
Crawly.Worker.parse_item({response, opts[:with]}),
pipelines <-
Crawly.Utils.get_settings(
:pipelines,
opts[:with]
),
items <- Map.get(parsed_result, :items, []),
{pipeline_result, pipeline_state} <-
Enum.reduce(items, {[], %{}}, fn item, {acc, state} ->
{piped, state} = Crawly.Utils.pipe(pipelines, item, state)

if piped == false do
# dropped
{acc, state}
else
{[piped | acc], state}
end
end) do
{response, parsed_result, pipeline_result, pipeline_state}
end
headers = Keyword.get(opts, :headers, [])
request_opts = Keyword.get(opts, :request_opts, [])
fetcher = Keyword.get(opts, :fetcher, Crawly.Fetchers.HTTPoisonFetcher)
request = Crawly.Request.new(url, headers, request_opts)
fetcher.fetch(request, request_opts)
end
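As an aside, the `:fetcher` option documented above accepts any module exposing a `fetch/2` function. A minimal sketch of such a module, assuming `Crawly.Request` carries `:url` and `:headers` fields (the module name is hypothetical):

```elixir
defmodule MyApp.LoggingFetcher do
  require Logger

  # Follows the documented contract: fetch(request, request_opts)
  def fetch(request, request_opts) do
    Logger.info("Fetching #{request.url}")
    HTTPoison.get(request.url, request.headers, request_opts)
  end
end

# Passed explicitly when calling fetch/2:
# Crawly.fetch("https://books.toscrape.com/", fetcher: MyApp.LoggingFetcher)
```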

@doc """
Fetches content from the given URL and processes it with the specified spider.

## Parameters

- `url`: The URL to fetch the content from. It should be a valid string.
- `spider_name`: The spider module responsible for processing the fetched response. The module must implement a `parse_item/1` function.
- `options`: A keyword list of options to customize the request. The options are passed directly to the `fetch/2` function.

Returns a `Crawly.ParsedItem.t()` on success, or `{:error, %HTTPoison.Error{}}` on failure.
"""
@spec fetch_with_spider(
url :: String.t(),
spider_name :: module(),
options :: list()
) ::
Crawly.ParsedItem.t() | {:error, HTTPoison.Error.t()}
def fetch_with_spider(url, spider_name, options \\ []) do
case fetch(url, options) do
{:ok, response} -> spider_name.parse_item(response)
{:error, _reason} = err -> err
end
end
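A hedged usage sketch of the new helper, assuming the `BooksToScrape` spider from the quickstart example:

```elixir
case Crawly.fetch_with_spider("https://books.toscrape.com/", BooksToScrape) do
  %{items: items, requests: requests} ->
    # Items extracted by parse_item/1 plus any follow-up requests it produced
    IO.inspect(items, label: "items")
    IO.inspect(length(requests), label: "new requests")

  {:error, %HTTPoison.Error{reason: reason}} ->
    IO.puts("Fetch failed: #{inspect(reason)}")
end
```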

86 changes: 57 additions & 29 deletions test/crawly_test.exs
@@ -1,44 +1,72 @@
defmodule CrawlyTest do
use ExUnit.Case

setup do
:meck.new(CrawlyTestSpider, [:non_strict])
describe "fetch/1" do
test "can fetch a given url" do
:meck.expect(HTTPoison, :get, fn _, _, _ ->
{:ok, %HTTPoison.Response{}}
end)

:meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
%{
items: [%{content: "hello"}],
requests: [
Crawly.Utils.request_from_url("https://www.example.com/test")
]
}
end)
assert {:ok, %HTTPoison.Response{}} = Crawly.fetch("https://example.com")
end

:meck.expect(CrawlyTestSpider, :override_settings, fn ->
[pipelines: [Crawly.Pipelines.JSONEncoder]]
end)
test "returns error if unable to fetch the page" do
:meck.expect(HTTPoison, :get, fn _, _, _ ->
{:error, %HTTPoison.Error{}}
end)

:meck.expect(HTTPoison, :get, fn _, _, _ -> {:ok, %HTTPoison.Response{}} end)
assert {:error, %HTTPoison.Error{}} = Crawly.fetch("invalid-url")
end

on_exit(fn ->
:meck.unload()
end)
test "can fetch a given url with custom request options" do
request_opts = [timeout: 5000, recv_timeout: 5000]

{:ok, spider_module: CrawlyTestSpider}
end
:meck.expect(HTTPoison, :get, fn _, _, passed_request_opts ->
assert passed_request_opts == request_opts
{:ok, %HTTPoison.Response{}}
end)

assert {:ok, %HTTPoison.Response{}} =
Crawly.fetch("https://example.com", request_opts: request_opts)
end

test "can fetch a given url with headers" do
headers = [{"Authorization", "Bearer token"}]

:meck.expect(HTTPoison, :get, fn _, headers_opts, _ ->
assert headers == headers_opts
{:ok, %HTTPoison.Response{}}
end)

test "fetch/1 is able to fetch a given url using global config, returns a response" do
assert %HTTPoison.Response{} = Crawly.fetch("https://example.com")
assert {:ok, %HTTPoison.Response{}} =
Crawly.fetch("https://example.com", headers: headers)
end
end

test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems",
%{spider_module: spider_module} do
assert {%HTTPoison.Response{}, parsed_item_res, parsed_items,
_pipeline_state} =
Crawly.fetch("http://example.com", with: spider_module)
describe "fetch_with_spider/3" do
test "Can fetch a given url from behalf of the spider" do
expected_new_requests = [
Crawly.Utils.request_from_url("https://www.example.com")
]

:meck.expect(HTTPoison, :get, fn _, _, _ ->
{:ok, %HTTPoison.Response{}}
end)

:meck.new(CrawlyTestSpider, [:non_strict])

:meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
%{
items: [%{content: "hello"}],
requests: expected_new_requests
}
end)

assert %{items: [_], requests: _requests} = parsed_item_res
%{requests: requests, items: items} =
Crawly.fetch_with_spider("https://example.com", CrawlyTestSpider)

assert [encoded] = parsed_items
assert encoded =~ "hello"
assert items == [%{content: "hello"}]
assert requests == expected_new_requests
end
end
end
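One case the new suite does not yet cover is the `:fetcher` option; a possible addition in the same meck-based style, where `FakeFetcher` is a hypothetical module defined only for this sketch:

```elixir
test "can fetch a given url with a custom fetcher" do
  :meck.new(FakeFetcher, [:non_strict])

  :meck.expect(FakeFetcher, :fetch, fn _request, _request_opts ->
    {:ok, %HTTPoison.Response{status_code: 200}}
  end)

  assert {:ok, %HTTPoison.Response{status_code: 200}} =
           Crawly.fetch("https://example.com", fetcher: FakeFetcher)
end
```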