From 2480ef079267a7f65d819b8e3538ec277cc32167 Mon Sep 17 00:00:00 2001
From: immerrr
Date: Fri, 27 Feb 2015 21:10:27 +0300
Subject: [PATCH] Initial version of benchmark suite

---
 splash/benchmark/README.rst        |  9 +++
 splash/benchmark/benchmark.py      | 80 ++++++++++++++++++++++++++
 splash/benchmark/download_sites.py | 90 ++++++++++++++++++++++++++++++
 splash/benchmark/file_server.py    | 31 ++++++++++
 4 files changed, 210 insertions(+)
 create mode 100644 splash/benchmark/README.rst
 create mode 100755 splash/benchmark/benchmark.py
 create mode 100644 splash/benchmark/download_sites.py
 create mode 100644 splash/benchmark/file_server.py

diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst
new file mode 100644
index 000000000..10e3b3e23
--- /dev/null
+++ b/splash/benchmark/README.rst
@@ -0,0 +1,9 @@
+This directory contains a preliminary version of the Splash benchmark suite.
+
+To use it, do the following:
+
+- install ``httrack``
+- create a directory for downloaded files, e.g. ``files``
+- run ``python download_sites.py`` in that directory to download the sites used in the benchmark
+- run ``python benchmark.py`` from the same directory to run the benchmark
+
diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
new file mode 100755
index 000000000..44201c1c7
--- /dev/null
+++ b/splash/benchmark/benchmark.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+
+"""
+Splash benchmark script.
+
+It takes a directory downloaded with Splash & httrack, fires up a static file
+server and runs a series of requests via Splash on those downloaded pages.
+
+"""
+
+import logging
+import random
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+from glob import glob
+from multiprocessing.pool import ThreadPool
+
+import requests
+from splash.benchmark.file_server import serve_files
+from splash.tests.utils import SplashServer
+
+PORT = 8806
+#: URLs to benchmark against.
+PAGES = glob('localhost_8806/*.html')
+#: Combinations of width & height to test.
+WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
+# XXX: add benchmarks for other API endpoints.
+SPLASH_LOG = 'splash.log'
+
+parser = ArgumentParser(description=__doc__,
+                        formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number')
+parser.add_argument('--thread-count', type=int, default=1,
+                    help='Request thread count')
+parser.add_argument('--request-count', type=int, default=10,
+                    help='Benchmark request count')
+
+
+def generate_requests(splash, args):
+    log = logging.getLogger('generate_requests')
+    log.info("Using PRNG seed: %s", args.seed)
+    rng = random.Random(args.seed)
+    for i in xrange(args.request_count):
+        page = rng.choice(PAGES)
+        width, height = rng.choice(WIDTH_HEIGHT)
+        url = 'http://localhost:%d/%s' % (PORT, page)
+        yield (i + 1, args.request_count,
+               {'url': splash.url('render.png'),
+                'params': {'url': url, 'width': width, 'height': height}})
+
+
+def parallel_map(func, iterable, thread_count):
+    if thread_count == 1:
+        return map(func, iterable)
+    else:
+        pool = ThreadPool(thread_count)
+        return pool.map(func, iterable)
+
+
+def invoke_request(invoke_args):
+    log = logging.getLogger('bench_worker')
+    req_no, total_reqs, kwargs = invoke_args
+    log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs)
+    return requests.get(**kwargs)
+
+
+def main():
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    with SplashServer(logfile=SPLASH_LOG,
+                      extra_args=['--disable-lua-sandbox',
+                                  '--disable-xvfb',
+                                  '--max-timeout=600']) as splash, \
+            serve_files(PORT):
+        parallel_map(invoke_request, generate_requests(splash, args),
+                     args.thread_count)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
new file mode 100644
index 000000000..0d0bf4b0f
--- /dev/null
+++ b/splash/benchmark/download_sites.py
@@ -0,0 +1,90 @@
+from splash.tests.stress import lua_runonce
+
+import re
+from urlparse import urlsplit
+import json
+from lxml import html
+import w3lib.html
+import subprocess
+from splash.benchmark.file_server import serve_files
+
+script_html = """
+function main(splash)
+splash:set_images_enabled(false)
+splash:go(splash.args.url)
+splash:wait(0.5)
+return {url=splash:url(), html=splash:html()}
+end
+"""
+
+script_png = """
+
+function main(splash)
+splash:go(splash.args.url)
+splash:wait(0.5)
+return splash:png()
+end
+"""
+
+
+USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"
+
+
+PORT = 8806
+
+
+def preprocess_main_page(url):
+    out = json.loads(lua_runonce(script_html, url=url,
+                                 splash_args=['--disable-lua-sandbox',
+                                              '--disable-xvfb',
+                                              '--max-timeout=600'],
+                                 timeout=600.,))
+    final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
+    if not w3lib.html.get_base_url(out['html']):
+        out['html'] = w3lib.html.remove_tags_with_content(
+            out['html'], ('script',))
+        root = html.fromstring(out['html'], parser=html.HTMLParser(),
+                               base_url=final_url)
+        try:
+            head = root.xpath('./head')[0]
+        except IndexError:
+            head = html.Element('head')
+            root.insert(0, head)
+        head.insert(0, html.Element('base', {'href': final_url}))
+        head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
+        out['html'] = html.tostring(root, encoding='utf-8',
+                                    doctype='<!DOCTYPE html>')
+    filename = re.sub(r'[^\w]+', '_', url) + '.html'
+    with open(filename, 'w') as f:
+        f.write(out['html'])
+    return filename
+
+
+def download_sites(sites):
+    local_files = [preprocess_main_page(s) for s in sites]
+
+    local_urls = [
+        'http://localhost:%(port)d/%(filename)s' % {
+            'port': PORT,
+            'filename': filename}
+        for filename in local_files
+    ]
+    args = ['--continue',
+            '--near',  # Fetch referred non-html files.
+            '-%P',  # Try parsing links in non-href/src sections
+            '-F', USERAGENT,  # Emulate splash UA
+            '--depth=1']
+    subprocess.check_call(['httrack'] + args + local_urls)
+
+
+if __name__ == '__main__':
+    with serve_files(PORT):
+        download_sites([
+            'http://www.wikipedia.org',
+            'http://www.google.com',
+            'http://www.reddit.com',
+            "http://w3.org",
+            "http://w3.org/TR/2010/REC-xhtml-basic-20101123/",
+            # "http://blog.pinterest.com",
+            # "http://imgur.com",
+        ])
diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py
new file mode 100644
index 000000000..77e2b7084
--- /dev/null
+++ b/splash/benchmark/file_server.py
@@ -0,0 +1,31 @@
+import SimpleHTTPServer
+import SocketServer
+import subprocess
+import sys
+from contextlib import contextmanager
+
+
+class ReusingTCPServer(SocketServer.TCPServer):
+    allow_reuse_address = True
+
+
+class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+    def address_string(self):
+        return "fileserver"
+
+
+@contextmanager
+def serve_files(port):
+    """Serve files from current directory statically in a subprocess."""
+    site_server = subprocess.Popen(['python', '-m', __name__,
+                                    str(port)])
+    try:
+        yield
+    finally:
+        site_server.terminate()
+
+
+if __name__ == '__main__':
+    port = int(sys.argv[1])
+    server = ReusingTCPServer(("", port), RequestHandler)
+    server.serve_forever()
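
Editor's note (commentary, not part of the diff): each benchmark iteration boils down to invoke_request issuing a plain GET against Splash's render.png endpoint with url, width and height parameters. Below is a minimal standalone sketch of that request, assuming a Splash instance already running on its default port 8050 and the file server from this patch serving the downloaded pages on port 8806; the page filename is hypothetical.

    # Sketch of a single benchmark request, outside the harness above.
    # Assumes Splash on localhost:8050 and the static file server on port 8806;
    # the page filename is hypothetical.
    import requests

    resp = requests.get('http://localhost:8050/render.png', params={
        'url': 'http://localhost:8806/localhost_8806/example_page.html',
        'width': 500,
        'height': 500,
    })
    resp.raise_for_status()
    with open('example_page.png', 'wb') as f:
        f.write(resp.content)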
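Similarly, a rough sanity check that the pages benchmark.py will request are actually reachable through the helper file server. It assumes the snippet is run from the directory holding the localhost_8806/ tree produced by download_sites.py; the one-second sleep is only there because serve_files spawns the server in a subprocess and does not wait for it to bind the port.

    # Sanity check: fetch every downloaded page through the helper server.
    import time
    from glob import glob

    import requests
    from splash.benchmark.file_server import serve_files

    PORT = 8806

    with serve_files(PORT):
        time.sleep(1)  # give the server subprocess a moment to bind the port
        for page in glob('localhost_8806/*.html'):
            resp = requests.get('http://localhost:%d/%s' % (PORT, page))
            print('%s %s' % (resp.status_code, page))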