diff --git a/documentation/basic_concepts.md b/documentation/basic_concepts.md
index 4b668be8..8bd4944c 100644
--- a/documentation/basic_concepts.md
+++ b/documentation/basic_concepts.md
@@ -340,8 +340,40 @@ Crawly's codebase contains a special Splash fetcher, which allows to do the brow
 rendering before the page content is being parsed by a spider. Also it's possible
 to build own fetchers.
+
+### Using crawly-render-server for browser rendering
+
+NOTE: Experimental
+
+I have made a simple Puppeteer-based browser rendering tool that is available
+here: https://github.com/elixir-crawly/crawly-render-server
+
+I am actively testing it with various targets, and so far the results look fine.
+However, I am very interested in feedback and contributions.
+
+To run it:
+1. git clone https://github.com/elixir-crawly/crawly-render-server.git
+2. cd ./crawly-render-server
+3. docker run -p 3000:3000 --rm -it $(docker build -q .)
+4. Configure it on the project or spider level, as shown below:
+
+(project level)
+```
+import Config
+
+config :crawly,
+  fetcher: {Crawly.Fetchers.CrawlyRenderServer, [base_url: "http://localhost:3000/render"]}
+```
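+
+(spider level; a minimal sketch where `MySpider` is a placeholder name, assuming
+the fetcher setting can also be overridden from the spider's `override_settings/0`
+callback)
+```
+defmodule MySpider do
+  use Crawly.Spider
+
+  # Use the render server only for this spider
+  def override_settings() do
+    [fetcher: {Crawly.Fetchers.CrawlyRenderServer, [base_url: "http://localhost:3000/render"]}]
+  end
+
+  # ... the usual base_url/0, init/0 and parse_item/1 callbacks go here ...
+end
+```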
" <> + "For example: " <> + "fetcher: {Crawly.Fetchers.CrawlyRenderServer, [base_url: ]}" + ) + + raise RuntimeError + + base_url -> + base_url + end + + req_body = + Poison.encode!(%{ + url: request.url, + headers: Map.new(request.headers) + }) + + case HTTPoison.post( + base_url, + req_body, + [{"content-type", "application/json"}], + request.options + ) do + {:ok, response} -> + js = Poison.decode!(response.body) + + new_response = %HTTPoison.Response{ + body: Map.get(js, "page"), + status_code: Map.get(js, "status"), + headers: Map.get(js, "headers"), + request_url: request.url, + request: request + } + + {:ok, new_response} + + err -> + err + end + end +end diff --git a/test/fetchers/crawly_render_server_test.exs b/test/fetchers/crawly_render_server_test.exs new file mode 100644 index 00000000..2f408a10 --- /dev/null +++ b/test/fetchers/crawly_render_server_test.exs @@ -0,0 +1,38 @@ +defmodule Crawly.Fetchers.CrawlyRenderServerTest do + use ExUnit.Case + import Crawly.Fetchers.CrawlyRenderServer + + test "throws an error when base_url is not set" do + request = %{ + url: "https://example.com", + headers: %{"User-Agent" => "Custom User Agent"} + } + + client_options = [] + + assert_raise RuntimeError, fn -> + fetch(request, client_options) + end + end + + test "composes correct request to render server" do + request = %{ + url: "https://example.com", + headers: [{"User-Agent", "Custom User Agent"}], + options: [] + } + + client_options = [base_url: "http://localhost:3000"] + + :meck.expect(HTTPoison, :post, fn base_url, body, headers, _options -> + assert headers == [{"content-type", "application/json"}] + assert base_url == "http://localhost:3000" + + body = Poison.decode!(body, %{keys: :atoms}) + assert "https://example.com" == body.url + assert %{:"User-Agent" => "Custom User Agent"} == body.headers + end) + + fetch(request, client_options) + end +end