diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst index 10e3b3e23..39a70688e 100644 --- a/splash/benchmark/README.rst +++ b/splash/benchmark/README.rst @@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite. To use it, do the following: - install ``httrack`` -- create a directory for downloaded files, e.g. ``files`` -- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark +- run ``python download_sites.py``; it creates a ``sites`` subdirectory in the + current directory and downloads the sites used in the benchmark into it - run ``python benchmark.py`` to run the benchmark diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index b9c501713..3a62e11bf 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -17,6 +17,7 @@ from multiprocessing.pool import ThreadPool from pprint import pformat from time import time +import re import requests from splash.benchmark.file_server import serve_files @@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params): #: Port at which static pages will be served. PORT = 8806 -#: Static pages to be used in the benchmark. -PAGES = glob('localhost_8806/*.html') #: Combinations of width & height to test. WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] +#: Splash log filename. SPLASH_LOG = 'splash.log' #: This script is used to collect maxrss & cpu time from splash process. 
GET_PERF_STATS_SCRIPT = """ @@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params): help='Request thread count') parser.add_argument('--request-count', type=int, default=10, help='Benchmark request count') +parser.add_argument('--sites-dir', type=str, default='sites', + help='Directory with downloaded sites') def generate_requests(splash, args): log = logging.getLogger('generate_requests') log.info("Using pRNG seed: %s", args.seed) + + # Static pages (relative to sites_dir) to be used in the benchmark. + pages = [re.sub('^%s/' % args.sites_dir, '', v) + for v in glob(os.path.join(args.sites_dir, 'localhost_8806', + '*.html'))] + for p in pages: + log.info("Using page for benchmark: %s", p) + rng = random.Random(args.seed) for i in xrange(args.request_count): - page = rng.choice(PAGES) + page = rng.choice(pages) width, height = rng.choice(WIDTH_HEIGHT) req_factory = rng.choice(REQ_FACTORIES) url = 'http://localhost:%d/%s' % (PORT, page) @@ -140,7 +150,7 @@ def main(): '--disable-xvfb', '--max-timeout=600']) - with splash, serve_files(PORT): + with splash, serve_files(PORT, args.sites_dir): start_time = time() results = parallel_map(invoke_request, generate_requests(splash, args), args.thread_count) diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py old mode 100644 new mode 100755 index abe921eee..482469bab --- a/splash/benchmark/download_sites.py +++ b/splash/benchmark/download_sites.py @@ -1,4 +1,13 @@ +#!/usr/bin/env python + +""" +Site downloader script for Splash benchmark suite. 
+""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +import errno import json +import os import re import subprocess from urlparse import urlsplit @@ -9,7 +18,7 @@ from splash.benchmark.file_server import serve_files from splash.tests.stress import lua_runonce -script_html = """ +SCRIPT_HTML = """ function main(splash) splash:set_images_enabled(false) splash:go(splash.args.url) @@ -18,24 +27,19 @@ end """ -script_png = """ - -function main(splash) -splash:go(splash.args.url) -splash:wait(0.5) -return splash:png() -end -""" - - +#: This UA is used by httrack to mimic Splash requests when downloading sites. USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34" - PORT = 8806 +parser = ArgumentParser(description=__doc__, + formatter_class=ArgumentDefaultsHelpFormatter) +parser.add_argument('--sites-dir', default='sites', + help='Directory for downloaded sites') -def preprocess_main_page(url): - out = json.loads(lua_runonce(script_html, url=url, + +def preprocess_main_page(sites_dir, url): + out = json.loads(lua_runonce(SCRIPT_HTML, url=url, splash_args=['--disable-lua-sandbox', '--disable-xvfb', '--max-timeout=600'], @@ -56,13 +60,13 @@ def preprocess_main_page(url): out['html'] = html.tostring(root, encoding='utf-8', doctype='') filename = re.sub(r'[^\w]+', '_', url) + '.html' - with open(filename, 'w') as f: + with open(os.path.join(sites_dir, filename), 'w') as f: f.write(out['html']) return filename -def download_sites(sites): - local_files = [preprocess_main_page(s) for s in sites] +def download_sites(sites_dir, sites): + local_files = [preprocess_main_page(sites_dir, s) for s in sites] local_urls = [ 'http://localhost:%(port)d/%(filename)s' % { @@ -75,12 +79,20 @@ def download_sites(sites): '-%P', # Try parsing links in non-href/src sections '-F', USERAGENT, # Emulate splash UA '--depth=1'] - subprocess.check_call(['httrack'] + args + local_urls) - - -if __name__ == '__main__': - with 
serve_files(PORT): - download_sites([ + subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir) + + +def main(): + args = parser.parse_args() + try: + os.makedirs(args.sites_dir) + except OSError as e: + if e.errno != errno.EEXIST: + raise + elif not os.path.isdir(args.sites_dir): + raise RuntimeError("Not a directory: %s" % args.sites_dir) + with serve_files(PORT, args.sites_dir): + download_sites(args.sites_dir, [ 'http://www.wikipedia.org', 'http://www.google.com', 'http://www.reddit.com', @@ -89,3 +101,7 @@ def download_sites(sites): # "http://blog.pinterest.com", # "http://imgur.com", ]) + + +if __name__ == '__main__': + main() diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py old mode 100644 new mode 100755 index 77e2b7084..2931f41ae --- a/splash/benchmark/file_server.py +++ b/splash/benchmark/file_server.py @@ -1,10 +1,22 @@ +#!/usr/bin/env python + +""" +Simple static file server. +""" + +import argparse +import os import SimpleHTTPServer import SocketServer import subprocess -import sys from contextlib import contextmanager +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument('port', type=int, help='Port number to listen at') +parser.add_argument('directory', type=str, help='Directory to serve') + + class ReusingTCPServer(SocketServer.TCPServer): allow_reuse_address = True @@ -15,10 +27,10 @@ def address_string(self): @contextmanager -def serve_files(port): +def serve_files(port, directory): """Serve files from current directory statically in a subprocess.""" site_server = subprocess.Popen(['python', '-m', __name__, - str(port)]) + str(port), directory]) try: yield finally: @@ -26,6 +38,7 @@ def serve_files(port): if __name__ == '__main__': - port = int(sys.argv[1]) - server = ReusingTCPServer(("", port), RequestHandler) + args = parser.parse_args() + os.chdir(args.directory) + server = ReusingTCPServer(("", args.port), RequestHandler) server.serve_forever()