Skip to content

Commit

Permalink
Take the RobotsTxt User-Agent from the Request (#294)
Browse files Browse the repository at this point in the history
* Get the correct user agent from the request

* Add debug logging to find out if it works as expected

* Unpack the user-agent correctly

* Use the enhanced gollum fork

* Handle a missing user-agent header

* Upgrade HTTPoison to 2.2

* Add a helper function to get headers

* Add a test for the user-agent

---------

Co-authored-by: Andreas Donig <[email protected]>
  • Loading branch information
adonig and Andreas Donig authored Apr 29, 2024
1 parent 2a6908f commit 711dba0
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 1 deletion.
7 changes: 6 additions & 1 deletion lib/crawly/middlewares/robotstxt.ex
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,15 @@ defmodule Crawly.Middlewares.RobotsTxt do
"""

@behaviour Crawly.Pipeline

require Logger

alias Crawly.Utils

def run(request, state, _opts \\ []) do
case Gollum.crawlable?("Crawly", request.url) do
user_agent = Utils.get_header(request.headers, "User-Agent", "Crawly")

case Gollum.crawlable?(user_agent, request.url) do
:uncrawlable ->
Logger.debug("Dropping request: #{request.url} (robots.txt filter)")

Expand Down
37 changes: 37 additions & 0 deletions lib/crawly/utils.ex
Original file line number Diff line number Diff line change
Expand Up @@ -457,4 +457,41 @@ defmodule Crawly.Utils do
raise "Invalid format: A #{setting} setting cannot be defined in the form `{#{inspect(x)}}`. Only the forms `{module, options}` and `module` are valid"
end
end

@doc """
Retrieves a header value from a list of key-value tuples or a map.

Header names are compared case-insensitively (ASCII only), for both binary
and atom keys, so `"User-Agent"`, `"user-agent"` and `:"User-Agent"` all
match the key `"User-Agent"`. The first matching entry with a truthy value
wins. Entries that are not two-element tuples, or whose key is neither an
atom nor a binary, are skipped.

## Parameters

  - `headers`: A list of key-value tuples or a map representing headers.
  - `key`: The name of the header to retrieve.
  - `default`: (Optional) The value to return if the header is not found.
    Defaults to `nil`.

## Returns

The value of the header if found, otherwise `default`.

## Examples

    iex> Crawly.Utils.get_header([{"User-Agent", "My Bot"}], "user-agent")
    "My Bot"

    iex> Crawly.Utils.get_header([], "User-Agent", "Crawly")
    "Crawly"
"""
@spec get_header(
        headers :: [{atom | binary, binary}] | %{binary => binary},
        key :: binary,
        default :: binary | nil
      ) :: binary | nil
def get_header(headers, key, default \\ nil) do
  wanted = String.downcase(key, :ascii)

  Enum.find_value(headers, default, fn
    # Atom keys are normalized the same way as binary keys; comparing the
    # raw atom text would make the lookup case-sensitive for atoms only.
    {name, value} when is_atom(name) ->
      if String.downcase(Atom.to_string(name), :ascii) == wanted, do: value

    {name, value} when is_binary(name) ->
      if String.downcase(name, :ascii) == wanted, do: value

    # Anything else (non-tuples, charlist keys, ...) cannot be a match.
    _other ->
      nil
  end)
end
end
22 changes: 22 additions & 0 deletions test/middlewares/robotstxt_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,26 @@ defmodule Middlewares.RobotsTxtTest do

assert {false, _state} = Crawly.Utils.pipe(middlewares, req, state)
end

test "Respects the User-Agent header when evaluating robots.txt" do
  # The mocked robots.txt check only lets "My Custom Bot" through.
  :meck.expect(Gollum, :crawlable?, fn user_agent, _url ->
    case user_agent do
      "My Custom Bot" -> :crawlable
      _ -> :uncrawlable
    end
  end)

  request = @valid
  spider_state = %{spider_name: :test_spider, crawl_id: "123"}

  # With the UserAgent middleware configured for the allowed bot, the
  # request is expected to pass through the RobotsTxt middleware.
  with_user_agent = [
    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]},
    Crawly.Middlewares.RobotsTxt
  ]

  assert {%Crawly.Request{}, _state} =
           Crawly.Utils.pipe(with_user_agent, request, spider_state)

  # With RobotsTxt alone, the request is expected to be dropped.
  robots_only = [Crawly.Middlewares.RobotsTxt]

  assert {false, _state} =
           Crawly.Utils.pipe(robots_only, request, spider_state)
end
end

0 comments on commit 711dba0

Please sign in to comment.