Skip to content

Commit

Permalink
Initial version of benchmark suite
Browse files Browse the repository at this point in the history
  • Loading branch information
immerrr committed Feb 27, 2015
1 parent a6b1371 commit 2480ef0
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 0 deletions.
9 changes: 9 additions & 0 deletions splash/benchmark/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
This directory contains a preliminary version of the Splash benchmark suite.

To use it, do the following:

- install ``httrack``
- create a directory for downloaded files, e.g. ``files``
- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
- run ``python benchmark.py`` to run the benchmark

80 changes: 80 additions & 0 deletions splash/benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python

"""
Splash benchmark script.
It takes a directory downloaded with splash & httrack, fires up a static file
server and runs a series of requests via splash on those downloaded pages.
"""

import logging
import random
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from glob import glob
from multiprocessing.pool import ThreadPool

import requests
from splash.file_server import serve_files
from splash.tests.utils import SplashServer

#: Port on which the static file server (splash.file_server) is started.
PORT = 8806
#: URLs to benchmark against.
#: NOTE(review): the glob is evaluated at import time relative to the current
#: working directory — run this script from the download directory (see README).
PAGES = glob('localhost_8806/*.html')
#: Combinations of width & height to test.
WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
# XXX: add benchmark of different API endpoints.
#: Logfile passed to the SplashServer instance started by main().
SPLASH_LOG = 'splash.log'

# Command-line interface; parsed in main().
parser = ArgumentParser(description=__doc__,
                        formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number')
parser.add_argument('--thread-count', type=int, default=1,
                    help='Request thread count')
parser.add_argument('--request-count', type=int, default=10,
                    help='Benchmark request count')


def generate_requests(splash, args):
    """Generate benchmark request descriptions.

    Yields ``(request_no, total_requests, kwargs)`` tuples where ``kwargs``
    is suitable for ``requests.get(**kwargs)``: a Splash ``render.png``
    endpoint URL plus the page/width/height parameters.

    :param splash: running Splash server (provides ``url()`` for endpoints).
    :param args: parsed command-line arguments (``seed``, ``request_count``).
    """
    log = logging.getLogger('generate_requests')
    log.info("Using pRNG seed: %s", args.seed)
    # Dedicated PRNG instance so the request sequence is reproducible
    # for a given --seed regardless of other random() consumers.
    rng = random.Random(args.seed)
    # range (not Python-2-only xrange) keeps this importable on Python 3 too.
    for i in range(args.request_count):
        page = rng.choice(PAGES)
        width, height = rng.choice(WIDTH_HEIGHT)
        url = 'http://localhost:%d/%s' % (PORT, page)
        yield (i + 1, args.request_count,
               {'url': splash.url('render.png'),
                'params': {'url': url, 'width': width, 'height': height}})


def parallel_map(func, iterable, thread_count):
    """Map ``func`` over ``iterable``, optionally in parallel.

    With ``thread_count == 1`` the work is done in the calling thread;
    otherwise a thread pool of the given size is used.  Always returns a
    list (bare ``map`` is an iterator on Python 3, so it is materialized
    for a consistent return type).

    :returns: list of results in input order.
    """
    if thread_count == 1:
        return list(map(func, iterable))
    pool = ThreadPool(thread_count)
    try:
        return pool.map(func, iterable)
    finally:
        # Shut the pool down explicitly so worker threads don't leak.
        pool.close()
        pool.join()


def invoke_request(invoke_args):
    """Execute one benchmark HTTP request.

    ``invoke_args`` is a ``(request_no, total_requests, kwargs)`` tuple as
    produced by ``generate_requests``; ``kwargs`` is passed straight to
    ``requests.get``.  Returns the ``requests`` response object.
    """
    seq_no, total, request_kwargs = invoke_args
    logger = logging.getLogger('bench_worker')
    logger.info("Initiating request %d/%d: %s", seq_no, total, request_kwargs)
    return requests.get(**request_kwargs)


def main():
    """Entry point: start Splash plus the file server and run the benchmark."""
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    splash_server = SplashServer(
        logfile=SPLASH_LOG,
        extra_args=['--disable-lua-sandbox',
                    '--disable-xvfb',
                    '--max-timeout=600'])
    with splash_server as splash:
        with serve_files(PORT):
            parallel_map(invoke_request, generate_requests(splash, args),
                         args.thread_count)


# Script entry point: run the benchmark when executed directly.
if __name__ == '__main__':
    main()
90 changes: 90 additions & 0 deletions splash/benchmark/download_sites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from splash.tests.stress import lua_runonce

import re
from urlparse import urlsplit
import json
from lxml import html
import w3lib.html
import subprocess
from splash.file_server import serve_files

#: Lua script run via Splash: load the page with images disabled, wait for
#: late content, and return the final (post-redirect) URL plus rendered HTML.
script_html = """
function main(splash)
splash:set_images_enabled(false)
splash:go(splash.args.url)
splash:wait(0.5)
return {url=splash:url(), html=splash:html()}
end
"""

#: Lua script that renders a page to PNG.
#: NOTE(review): not referenced in this file — presumably kept for future
#: endpoint benchmarks; confirm before removing.
script_png = """
function main(splash)
splash:go(splash.args.url)
splash:wait(0.5)
return splash:png()
end
"""


# User agent string matching the one Splash's WebKit uses, so httrack
# downloads the same resources Splash would request.
USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"


#: Port for the local static file server (must match benchmark.py).
PORT = 8806


def preprocess_main_page(url):
    """Render *url* via Splash and save its HTML to a local file.

    The page is rendered headlessly (images disabled) through Splash; if the
    resulting HTML has no ``<base>`` tag, scripts are stripped and a
    ``<base href=final_url>`` plus a utf-8 ``<meta charset>`` are injected so
    relative links keep working when the page is served locally.

    :param url: URL of the page to download.
    :returns: name of the file the HTML was written to (derived from *url*).
    """
    out = json.loads(lua_runonce(script_html, url=url,
                                 splash_args=['--disable-lua-sandbox',
                                              '--disable-xvfb',
                                              '--max-timeout=600'],
                                 timeout=600.,))
    # Final URL after redirects, with query string and fragment removed.
    final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
    if not w3lib.html.get_base_url(out['html']):
        # No explicit <base> in the page: drop scripts (they may rewrite
        # URLs at runtime) and inject one pointing at the original site.
        out['html'] = w3lib.html.remove_tags_with_content(
            out['html'], ('script',))
        root = html.fromstring(out['html'], parser=html.HTMLParser(),
                               base_url=final_url)
        try:
            head = root.xpath('./head')[0]
        except IndexError:
            # Page has no <head>; create one as the first child.
            head = html.Element('head')
            root.insert(0, head)
        head.insert(0, html.Element('base', {'href': final_url}))
        head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
        out['html'] = html.tostring(root, encoding='utf-8',
                                    doctype='<!DOCTYPE html>')
    # Derive a filesystem-safe filename from the URL.
    filename = re.sub(r'[^\w]+', '_', url) + '.html'
    with open(filename, 'w') as f:
        f.write(out['html'])
    return filename


def download_sites(sites):
    """Preprocess *sites* through Splash, then mirror them with httrack.

    Each site is first rendered and saved locally (see
    ``preprocess_main_page``), then httrack fetches the referenced assets
    via the local file server.
    """
    filenames = [preprocess_main_page(site) for site in sites]

    local_urls = [
        'http://localhost:%(port)d/%(filename)s' % {
            'port': PORT, 'filename': name
        }
        for name in filenames
    ]
    httrack_args = [
        '--continue',
        '--near',          # Fetch referred non-html files.
        '-%P',             # Try parsing links in non-href/src sections
        '-F', USERAGENT,   # Emulate splash UA
        '--depth=1',
    ]
    subprocess.check_call(['httrack'] + httrack_args + local_urls)


# Script entry point: start the local file server, then download and
# preprocess the benchmark pages.
if __name__ == '__main__':
    with serve_files(PORT):
        download_sites([
            'http://www.wikipedia.org',
            'http://www.google.com',
            'http://www.reddit.com',
            "http://w3.org",
            "http://w3.org/TR/2010/REC-xhtml-basic-20101123/",
            # "http://blog.pinterest.com",
            # "http://imgur.com",
        ])
31 changes: 31 additions & 0 deletions splash/benchmark/file_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import SimpleHTTPServer
import SocketServer
import subprocess
import sys
from contextlib import contextmanager


class ReusingTCPServer(SocketServer.TCPServer):
    """TCPServer that sets SO_REUSEADDR so the port can be rebound quickly."""
    # Allows restarting the benchmark without waiting for TIME_WAIT to expire.
    allow_reuse_address = True


class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Static file handler with a fixed client name in log output."""
    def address_string(self):
        # Return a constant instead of the client address — avoids the
        # per-request reverse DNS lookup the base class performs.
        return "fileserver"


@contextmanager
def serve_files(port):
    """Serve files from current directory statically in a subprocess.

    The server runs for the duration of the ``with`` block and is
    terminated on exit.

    :param port: TCP port the file server listens on.
    """
    # sys.executable (not a bare 'python') so the server runs under the
    # same interpreter/virtualenv as the caller.
    site_server = subprocess.Popen([sys.executable, '-m', __name__,
                                    str(port)])
    try:
        yield
    finally:
        site_server.terminate()
        # Reap the child so it doesn't linger as a zombie.
        site_server.wait()


# Script entry point: serve the current directory over HTTP on the port
# given as the first command-line argument (used by serve_files above).
if __name__ == '__main__':
    port = int(sys.argv[1])
    server = ReusingTCPServer(("", port), RequestHandler)
    server.serve_forever()

0 comments on commit 2480ef0

Please sign in to comment.