diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst new file mode 100644 index 000000000..39a70688e --- /dev/null +++ b/splash/benchmark/README.rst @@ -0,0 +1,9 @@ +This directory contains a preliminary version of the Splash benchmark suite. + +To use it, do the following: + +- install ``httrack`` +- run ``python download_sites.py``; it will create a ``sites`` subdirectory in + the current directory and download the sites to be used in the benchmark there +- run ``python benchmark.py`` to run the benchmark + diff --git a/splash/benchmark/__init__.py b/splash/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py new file mode 100755 index 000000000..ca22db518 --- /dev/null +++ b/splash/benchmark/benchmark.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python + +""" +Splash benchmark script. + +It takes a directory downloaded with splash & httrack, fires up a static file +server and runs a series of requests via splash on those downloaded pages. 
+ +""" + +import json +import logging +import os +import random +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, FileType +from glob import glob +from multiprocessing.pool import ThreadPool +from pprint import pformat +from time import time +import re +import sys + +import requests + + +def make_render_png_req(splash, params): + """Make PNG render request via render.png endpoint.""" + return {'url': splash.url('render.png'), + 'params': params} + + +def make_render_json_req(splash, params): + """Make PNG render request via JSON endpoint.""" + json_params = params.copy() + json_params['png'] = 1 + return {'url': splash.url('render.json'), + 'params': json_params} + + +def make_render_png_lua_req(splash, params): + """Make PNG render request via Lua execute endpoint.""" + lua_params = params.copy() + lua_params['lua_source'] = """ +function main(splash) + assert(splash:go(splash.args.url)) + if splash.args.wait then + assert(splash:wait(splash.args.wait)) + end + splash:set_result_content_type("image/png") + return splash:png{width=splash.args.width, + height=splash.args.height, + render_all=splash.args.render_all} +end +""" + return {'url': splash.url('execute'), + 'params': lua_params} + + +def make_render_html_req(splash, params): + """Make HTML render request via render.html endpoint.""" + return {'url': splash.url('render.html'), + 'params': params} + + +def make_render_html_json_req(splash, params): + """Make HTML render request via JSON endpoint.""" + json_params = params.copy() + json_params['html'] = 1 + return {'url': splash.url('render.json'), + 'params': json_params} + + +def make_render_html_lua_req(splash, params): + """Make HTML render request via Lua execute endpoint.""" + lua_params = params.copy() + lua_params['lua_source'] = """ +function main(splash) + assert(splash:go(splash.args.url)) + if splash.args.wait then + assert(splash:wait(splash.args.wait)) + end + splash:set_result_content_type("text/html; charset=UTF-8") + return 
splash:html{} +end +""" + return {'url': splash.url('execute'), + 'params': lua_params} + + +#: Same resource may be rendered by various endpoints with slightly varying +#: parameter combinations. Request factories set those combinations up. +REQ_FACTORIES = { + 'png': [ + make_render_png_req, + make_render_json_req, + make_render_png_lua_req, + ], + 'html': [ + make_render_html_req, + make_render_html_json_req, + make_render_html_lua_req, + ], +} + + +#: Port at which static pages will be served. +PORT = 8806 +#: Combinations of width & height to test. +WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] +#: Splash & fileserver log filenames (set to None to put it to stderr). +SPLASH_LOG = 'splash.log' +FILESERVER_LOG = 'fileserver.log' +#: This script is used to collect maxrss & cpu time from splash process. +GET_PERF_STATS_SCRIPT = """ +function main(splash) + return splash:get_perf_stats() +end +""" + + +parser = ArgumentParser(description=__doc__, + formatter_class=ArgumentDefaultsHelpFormatter) +parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number') +parser.add_argument('--thread-count', type=int, default=1, + help='Request thread count') +parser.add_argument('--request-count', type=int, default=10, + help='Benchmark request count') +parser.add_argument('--sites-dir', type=str, default='sites', + help='Directory with downloaded sites') +parser.add_argument('--file-server', metavar='HOST:PORT', + help='Use existing file server instance available at HOST:PORT') +parser.add_argument('--splash-server', metavar='HOST:PORT', + help='Use existing Splash instance available at HOST:PORT') +parser.add_argument('--out-file', type=FileType(mode='w'), default=sys.stdout, + help='Write detailed request information in this file') +parser.add_argument('--render-type', choices=('html', 'png'), default='png', + help=('Type of rendering to benchmark' + ' (either "html" or "png")')) + + +def generate_requests(splash, 
file_server, args): + log = logging.getLogger('generate_requests') + log.info("Using pRNG seed: %s", args.seed) + + # Static pages (relative to sites_dir) to be used in the benchmark. + log.info("sites dir: %s", args.sites_dir) + sites_found = glob(os.path.join(args.sites_dir, 'localhost_8806', '*.html')) + log.info("sites found: %s", sites_found) + pages = [re.sub('^%s/' % args.sites_dir.rstrip('/'), '', v) for v in sites_found] + for p in pages: + log.info("Using page for benchmark: %s", p) + + request_factories = REQ_FACTORIES[args.render_type] + + rng = random.Random(args.seed) + for i in xrange(args.request_count): + page = rng.choice(pages) + width, height = rng.choice(WIDTH_HEIGHT) + req_factory = rng.choice(request_factories) + url = file_server.url(page) + params = {'url': url, 'render_all': 1, 'wait': 0.1, + 'width': width, 'height': height} + log.debug("Req factory: %s, params: %s", req_factory, params) + yield (i + 1, args.request_count, req_factory(splash, params)) + + +def parallel_map(func, iterable, thread_count): + if thread_count == 1: + return map(func, iterable) + else: + pool = ThreadPool(thread_count) + return pool.map(func, iterable) + + +def invoke_request(invoke_args): + log = logging.getLogger('bench_worker') + req_no, total_reqs, kwargs = invoke_args + log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs) + stime = time() + response = requests.get(**kwargs) + etime = time() + if response.status_code != 200: + log.error("Non-OK response:\n%s", response.text) + return {'start_time': stime, + 'end_time': etime, + 'duration': etime - stime, + 'endpoint': kwargs['url'], + 'status': response.status_code, + 'site': kwargs['params']['url'], + 'width': kwargs['params']['width'], + 'height': kwargs['params']['height']} + + +class ExistingServerWrapper(object): + """Wrapper for pre-existing Splash instance.""" + def __init__(self, server): + self.server = server + if not self.server.startswith('http://'): + self.server = 'http://' + 
self.server + + def url(self, endpoint): + return self.server + '/' + endpoint + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + +def main(): + log = logging.getLogger("benchmark") + args = parser.parse_args() + (logging.getLogger('requests.packages.urllib3.connectionpool') + .setLevel(logging.WARNING)) + logging.basicConfig(level=logging.DEBUG) + + if args.splash_server: + splash = ExistingServerWrapper(args.splash_server) + else: + from splash.tests.utils import SplashServer + splash = SplashServer( + logfile=SPLASH_LOG, + extra_args=['--disable-lua-sandbox', + '--disable-xvfb', + '--max-timeout=600']) + + if args.file_server: + file_server = ExistingServerWrapper(args.file_server) + else: + from splash.benchmark.file_server import FileServerSubprocess + file_server = FileServerSubprocess(port=PORT, + path=args.sites_dir, + logfile=FILESERVER_LOG) + + with splash, file_server: + log.info("Servers are up, starting benchmark...") + start_res = requests.get( + splash.url('execute'), + params={'lua_source': GET_PERF_STATS_SCRIPT}).json() + start_time = time() + results = parallel_map(invoke_request, + generate_requests(splash, file_server, args), + args.thread_count) + end_time = time() + end_res = requests.get( + splash.url('execute'), + params={'lua_source': GET_PERF_STATS_SCRIPT}).json() + + log.info("Writing stats to %s", args.out_file.name) + args.out_file.write(json.dumps( + {'maxrss': end_res['maxrss'], + 'cputime': end_res['cputime'] - start_res['cputime'], + 'walltime': end_time - start_time, + 'requests': results}, + indent=2)) + log.info("Splash max RSS: %s B", end_res['maxrss']) + log.info("Splash CPU time elapsed: %.2f sec", + end_res['cputime'] - start_res['cputime']) + log.info("Wallclock time elapsed: %.2f sec", end_time - start_time) + + +if __name__ == '__main__': + main() diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py new file mode 100755 index 000000000..07a9577de --- /dev/null 
+++ b/splash/benchmark/download_sites.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python + +""" +Site downloader script for Splash benchmark suite. +""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +import errno +import json +import os +import re +import subprocess +import logging +from urlparse import urlsplit + +from lxml import html + +import w3lib.html +from splash.benchmark.file_server import FileServerSubprocess +from splash.tests.stress import lua_runonce + +SCRIPT_HTML = """ +function main(splash) +splash:set_images_enabled(false) +splash:go(splash.args.url) +splash:wait(0.5) +return {url=splash:url(), html=splash:html()} +end +""" + +#: This UA is used by httrack to mimic Splash requests when downloading sites. +USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34" + +PORT = 8806 + +parser = ArgumentParser(description=__doc__, + formatter_class=ArgumentDefaultsHelpFormatter) +parser.add_argument('--sites-dir', default='sites', + help='Directory for downloaded sites') + + +def preprocess_main_page(sites_dir, url): + """ + This function does several things: + - strip javascript so that downloaded pages look exactly the same + - add baseurl to resolve relative links properly (if it is missing) + - add meta charset description (if it is missing) + """ + out = json.loads(lua_runonce(SCRIPT_HTML, url=url, + splash_args=['--disable-lua-sandbox', + '--disable-xvfb', + '--max-timeout=600'], + timeout=600.,)) + final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl() + # Ensure there are no scripts to be executed. 
+ out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',)) + root = html.fromstring(out['html'], parser=html.HTMLParser(), + base_url=final_url) + try: + head = root.xpath('./head')[0] + except IndexError: + head = html.Element('head') + root.insert(0, head) + if not head.xpath('./base/@href'): + head.insert(0, html.Element('base', {'href': final_url})) + if not head.xpath('./meta/@charset'): + head.insert(0, html.Element('meta', {'charset': 'utf-8'})) + out['html'] = html.tostring(root, encoding='utf-8', + doctype='') + filename = re.sub(r'[^\w]+', '_', url) + '.html' + with open(os.path.join(sites_dir, filename), 'w') as f: + f.write(out['html']) + return filename + + +def download_sites(sites_dir, sites): + local_files = [preprocess_main_page(sites_dir, s) for s in sites] + + local_urls = [ + 'http://localhost:%(port)d/%(filename)s' % { + 'port': PORT, 'filename': filename + } + for filename in local_files + ] + args = ['--continue', + '--near', # Fetch referred non-html files. 
+ '-%P', # Try parsing links in non-href/src sections + '-F', USERAGENT, # Emulate splash UA + '--depth=1'] + subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir) + + +def main(): + args = parser.parse_args() + (logging.getLogger('requests.packages.urllib3.connectionpool') + .setLevel(logging.WARNING)) + logging.basicConfig(level=logging.DEBUG) + logging.info("Starting site download suite") + try: + os.makedirs(args.sites_dir) + except OSError as e: + if e.errno != errno.EEXIST: + raise + elif not os.path.isdir(args.sites_dir): + raise RuntimeError("Not a directory: %s" % args.sites_dir) + with FileServerSubprocess(port=PORT, path=args.sites_dir): + download_sites(args.sites_dir, [ + 'http://www.wikipedia.org', + 'http://www.google.com', + 'http://www.reddit.com', + "http://w3.org", + "http://w3.org/TR/2010/REC-xhtml-basic-20101123/", + # "http://blog.pinterest.com", + # "http://imgur.com", + ]) + + +if __name__ == '__main__': + main() diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py new file mode 100755 index 000000000..bb0549ce8 --- /dev/null +++ b/splash/benchmark/file_server.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +"""Simple static file server.""" + +import argparse +import os +import subprocess +import time +import sys +import logging +from contextlib import contextmanager + +from twisted.internet import reactor +from twisted.web.server import Site +from twisted.web.static import File +from twisted.python.log import startLogging + +import requests + +parser = argparse.ArgumentParser("") +parser.add_argument('--port', type=int, default=8806) +parser.add_argument('--path', help='Path to be served', default='.') +parser.add_argument('--logfile', default=sys.stderr, + type=argparse.FileType(mode='w'), + help='File to write logs to') + + +class FileServerSubprocess(object): + logger = logging.getLogger('file_server') + + """Serve files from specified directory statically in a subprocess.""" + def __init__(self, 
port, path, logfile=None): + self.port = port + self.path = path + self.logfile = logfile + self.server = 'http://localhost:%d' % port + + def url(self, endpoint): + return self.server + '/' + endpoint + + def __enter__(self): + # command = ['twistd', + # '-n', # don't daemonize + # 'web', # start web component + # '--port', str(int(port)), + # '--path', os.path.abspath(directory), ] + # if logfile is not None: + # command += ['--logfile', logfile] + command = ['python', __file__, + '--port', str(int(self.port)), + '--path', os.path.abspath(self.path)] + if self.logfile is not None: + command += ['--logfile', self.logfile] + self.logger.info("Starting file server subprocess: %s", command) + self._site_server = subprocess.Popen(command) + # It might take some time to bring up the server, wait for up to 10s. + for i in xrange(100): + try: + self.logger.info("Checking if file server is active") + requests.get(self.url('')) + break + except requests.ConnectionError: + time.sleep(0.1) + else: + msg = "File server subprocess startup timed out" + if self.logfile: + with open(self.logfile, 'r') as log_f: + msg += ", logs:\n" + log_f.read() + raise RuntimeError(msg) + + def __exit__(self, *args): + self._site_server.kill() + self._site_server.wait() + + +def main(): + args = parser.parse_args() + startLogging(args.logfile) + resource = File(os.path.abspath(args.path)) + site = Site(resource) + reactor.listenTCP(args.port, site) + reactor.run() + + +if __name__ == '__main__': + main() diff --git a/splash/conftest.py b/splash/conftest.py index 2a20cc38a..f93ed2990 100644 --- a/splash/conftest.py +++ b/splash/conftest.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import from splash import lua +import glob collect_ignore = [] @@ -15,3 +16,8 @@ 'kernel/__main__.py', 'kernel/__init__.py', ] + +collect_ignore.extend([ + 'benchmark/download_sites.py', + 'benchmark/file_server.py', + 'benchmark/benchmark.py']) diff --git a/splash/qtrender_lua.py 
b/splash/qtrender_lua.py index 82f6f8fe5..0e12b8503 100644 --- a/splash/qtrender_lua.py +++ b/splash/qtrender_lua.py @@ -497,7 +497,7 @@ def get_perf_stats(self): rss_mul = 1 if sys.platform == 'darwin' else 1024 return {'maxrss': rusage.ru_maxrss * rss_mul, 'cputime': rusage.ru_utime + rusage.ru_stime, - 'walltime': time.time()} + 'walltime': time.time()} def get_real_exception(self): if self._exceptions: