elixir-crawly · oltarasenko · Apr 9, 2024 · Aug 21, 2023
diff --git a/documentation/basic_concepts.md b/documentation/basic_concepts.md
@@ -340,8 +340,40 @@ Crawly's codebase contains a special Splash fetcher, which allows to do the brow
 rendering before the page content is being parsed by a spider. Also it's possible
 to build own fetchers.
 
+
+### Using crawly-render-server for browser rendering
+
+NOTE: Experimental
+
+I have made a simple puppeteer based browser rendering tool that's available
+here: https://github.com/elixir-crawly/crawly-render-server
+
+I am actively testing it with various targets, and at least for me
+the results look fine. However, I am very interested in feedback
+or contributions from others.
+
+To run it do this:
+1. git clone https://github.com/elixir-crawly/crawly-render-server.git
+2. cd ./crawly-render-server
+3. docker run -p 3000:3000  --rm -it $(docker build -q .)
+4. configure it on project or spider level:
+
+(project level)
+```
+import Config
+
+config :crawly,
+  fetcher: {Crawly.Fetchers.CrawlyRenderServer, [base_url: "http://localhost:3000/render"]}
+```
+
+
 ### Using splash fetcher for browser rendering
 
+NOTE: It looks like splash is not maintained anymore.
+
+We could not run its Docker images on M1/M2 Mac machines. We could not
+build it from sources either :(
+
 Splash is a lightweight opensourse browser implementation built with QT and python.
 See: https://splash.readthedocs.io/en/stable/api.html
 

diff --git a/lib/crawly/fetchers/crawly_render_server.ex b/lib/crawly/fetchers/crawly_render_server.ex
@@ -0,0 +1,107 @@
+defmodule Crawly.Fetchers.CrawlyRenderServer do
+  @moduledoc ~S"""
+  Implements the `Crawly.Fetchers.Fetcher` behaviour for Crawly Render Server
+  JavaScript rendering.
+
+  Crawly Render Server is a lightweight Puppeteer-based JavaScript rendering
+  engine server. Quite experimental. See more:
+  https://github.com/elixir-crawly/crawly-render-server
+
+  It exposes a `/render` endpoint that renders JS on incoming requests. For example:
+
+      curl -X POST \
+        http://localhost:3000/render \
+        -H 'Content-Type: application/json' \
+        -d '{
+           "url": "https://example.com",
+           "headers": {"User-Agent": "Custom User Agent"}
+      }'
+
+  In this case you have to configure the fetcher in the following way:
+  `fetcher: {Crawly.Fetchers.CrawlyRenderServer, [base_url: "http://localhost:3000/render"]}`
+  """
+  @behaviour Crawly.Fetchers.Fetcher
+
+  require Logger
+
+  @doc """
+  Fetches `request.url` through the render server configured in `client_options`.
+
+  `client_options` must contain `:base_url`, the URL of the render endpoint.
+  The request URL and headers are posted there as a JSON body, and
+  `request.options` is passed to `HTTPoison.post/4` unchanged.
+
+  Returns `{:ok, %HTTPoison.Response{}}` built from the `"page"`, `"status"` and
+  `"headers"` keys of the JSON reply, `{:error, {:invalid_render_response, term}}`
+  when the reply body does not decode to a JSON object, or the value returned
+  by `HTTPoison.post/4` when it is not an `{:ok, response}` tuple.
+
+  Raises `RuntimeError` when `:base_url` is not set.
+  """
+  @impl true
+  def fetch(request, client_options) do
+    base_url = fetch_base_url!(client_options)
+
+    req_body =
+      Poison.encode!(%{
+        url: request.url,
+        headers: Map.new(request.headers)
+      })
+
+    case HTTPoison.post(
+           base_url,
+           req_body,
+           [{"content-type", "application/json"}],
+           request.options
+         ) do
+      {:ok, response} ->
+        build_response(request, response.body)
+
+      err ->
+        err
+    end
+  end
+
+  # Returns the configured render endpoint; logs and raises when it is missing.
+  defp fetch_base_url!(client_options) do
+    case Keyword.get(client_options, :base_url) do
+      nil ->
+        message =
+          "The base_url is not set. CrawlyRenderServer can't be used! " <>
+            "Please set :base_url in fetcher options to continue. " <>
+            "For example: " <>
+            "fetcher: {Crawly.Fetchers.CrawlyRenderServer, [base_url: <url>]}"
+
+        Logger.error(message)
+
+        # Raise with the same text so the crash report explains itself even
+        # when the log line is filtered out.
+        raise RuntimeError, message: message
+
+      base_url ->
+        base_url
+    end
+  end
+
+  # Builds the response struct from the render server's JSON reply.
+  defp build_response(request, body) do
+    # Poison.decode/1 reports malformed JSON as a return value rather than
+    # raising like decode!/1, so an HTML error page from the render server
+    # ends up as an error tuple instead of crashing the caller.
+    case Poison.decode(body) do
+      {:ok, js} when is_map(js) ->
+        rendered = %HTTPoison.Response{
+          body: Map.get(js, "page"),
+          status_code: Map.get(js, "status"),
+          headers: Map.get(js, "headers"),
+          request_url: request.url,
+          request: request
+        }
+
+        {:ok, rendered}
+
+      other ->
+        {:error, {:invalid_render_response, other}}
+    end
+  end
+end
diff --git a/test/fetchers/crawly_render_server_test.exs b/test/fetchers/crawly_render_server_test.exs
@@ -0,0 +1,63 @@
+defmodule Crawly.Fetchers.CrawlyRenderServerTest do
+  use ExUnit.Case
+
+  alias Crawly.Fetchers.CrawlyRenderServer
+
+  setup do
+    # HTTPoison is mocked globally by meck; unload it so the mock cannot leak
+    # into other test modules.
+    on_exit(fn -> :meck.unload() end)
+    :ok
+  end
+
+  test "throws an error when base_url is not set" do
+    request = %{
+      url: "https://example.com",
+      headers: %{"User-Agent" => "Custom User Agent"}
+    }
+
+    client_options = []
+
+    assert_raise RuntimeError, fn ->
+      CrawlyRenderServer.fetch(request, client_options)
+    end
+  end
+
+  test "composes correct request to render server" do
+    request = %{
+      url: "https://example.com",
+      headers: [{"User-Agent", "Custom User Agent"}],
+      options: []
+    }
+
+    client_options = [base_url: "http://localhost:3000"]
+
+    rendered =
+      Poison.encode!(%{
+        page: "<html>rendered</html>",
+        status: 200,
+        headers: %{"content-type" => "text/html"}
+      })
+
+    :meck.expect(HTTPoison, :post, fn base_url, body, headers, _options ->
+      assert headers == [{"content-type", "application/json"}]
+      assert base_url == "http://localhost:3000"
+
+      assert %{
+               "url" => "https://example.com",
+               "headers" => %{"User-Agent" => "Custom User Agent"}
+             } == Poison.decode!(body)
+
+      # Reply like the render server does so fetch/2 exercises its success path.
+      {:ok, %HTTPoison.Response{status_code: 200, body: rendered}}
+    end)
+
+    assert {:ok, %HTTPoison.Response{} = response} =
+             CrawlyRenderServer.fetch(request, client_options)
+
+    assert response.body == "<html>rendered</html>"
+    assert response.status_code == 200
+    assert response.headers == %{"content-type" => "text/html"}
+    assert response.request_url == "https://example.com"
+  end
+end