Skip to content

Commit

Permalink
benchmark: put downloaded sites into a configurable subdir
Browse files Browse the repository at this point in the history
  • Loading branch information
immerrr committed Mar 2, 2015
1 parent 2e5cce1 commit fe6752c
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 34 deletions.
4 changes: 2 additions & 2 deletions splash/benchmark/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite.
To use it, do the following:

- install ``httrack``
- create a directory for downloaded files, e.g. ``files``
- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
- run ``python download_sites.py``; it will create a ``sites`` subdirectory in
  the current directory and download the sites to be used in the benchmark there
- run ``python benchmark.py`` to run the benchmark

18 changes: 14 additions & 4 deletions splash/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from multiprocessing.pool import ThreadPool
from pprint import pformat
from time import time
import re

import requests
from splash.benchmark.file_server import serve_files
Expand Down Expand Up @@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params):

#: Port at which static pages will be served.
PORT = 8806
#: Static pages to be used in the benchmark.
PAGES = glob('localhost_8806/*.html')
#: Combinations of width & height to test.
WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
#: Splash log filename.
SPLASH_LOG = 'splash.log'
#: This script is used to collect maxrss & cpu time from splash process.
GET_PERF_STATS_SCRIPT = """
Expand All @@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params):
help='Request thread count')
parser.add_argument('--request-count', type=int, default=10,
help='Benchmark request count')
parser.add_argument('--sites-dir', type=str, default='sites',
help='Directory with downloaded sites')


def generate_requests(splash, args):
log = logging.getLogger('generate_requests')
log.info("Using pRNG seed: %s", args.seed)

# Static pages (relative to sites_dir) to be used in the benchmark.
pages = [re.sub('^%s/' % args.sites_dir, '', v)
for v in glob(os.path.join(args.sites_dir, 'localhost_8806',
'*.html'))]
for p in pages:
log.info("Using page for benchmark: %s", p)

rng = random.Random(args.seed)
for i in xrange(args.request_count):
page = rng.choice(PAGES)
page = rng.choice(pages)
width, height = rng.choice(WIDTH_HEIGHT)
req_factory = rng.choice(REQ_FACTORIES)
url = 'http://localhost:%d/%s' % (PORT, page)
Expand Down Expand Up @@ -140,7 +150,7 @@ def main():
'--disable-xvfb',
'--max-timeout=600'])

with splash, serve_files(PORT):
with splash, serve_files(PORT, args.sites_dir):
start_time = time()
results = parallel_map(invoke_request, generate_requests(splash, args),
args.thread_count)
Expand Down
62 changes: 39 additions & 23 deletions splash/benchmark/download_sites.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
#!/usr/bin/env python

"""
Site downloader script for Splash benchmark suite.
"""

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import errno
import json
import os
import re
import subprocess
from urlparse import urlsplit
Expand All @@ -9,7 +18,7 @@
from splash.benchmark.file_server import serve_files
from splash.tests.stress import lua_runonce

script_html = """
SCRIPT_HTML = """
function main(splash)
splash:set_images_enabled(false)
splash:go(splash.args.url)
Expand All @@ -18,24 +27,19 @@
end
"""

script_png = """
function main(splash)
splash:go(splash.args.url)
splash:wait(0.5)
return splash:png()
end
"""


#: This UA is used by httrack to mimic Splash requests when downloading sites.
USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"


PORT = 8806

parser = ArgumentParser(description=__doc__,
formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--sites-dir', default='sites',
help='Directory for downloaded sites')

def preprocess_main_page(url):
out = json.loads(lua_runonce(script_html, url=url,

def preprocess_main_page(sites_dir, url):
out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
splash_args=['--disable-lua-sandbox',
'--disable-xvfb',
'--max-timeout=600'],
Expand All @@ -56,13 +60,13 @@ def preprocess_main_page(url):
out['html'] = html.tostring(root, encoding='utf-8',
doctype='<!DOCTYPE html>')
filename = re.sub(r'[^\w]+', '_', url) + '.html'
with open(filename, 'w') as f:
with open(os.path.join(sites_dir, filename), 'w') as f:
f.write(out['html'])
return filename


def download_sites(sites):
local_files = [preprocess_main_page(s) for s in sites]
def download_sites(sites_dir, sites):
local_files = [preprocess_main_page(sites_dir, s) for s in sites]

local_urls = [
'http://localhost:%(port)d/%(filename)s' % {
Expand All @@ -75,12 +79,20 @@ def download_sites(sites):
'-%P', # Try parsing links in non-href/src sections
'-F', USERAGENT, # Emulate splash UA
'--depth=1']
subprocess.check_call(['httrack'] + args + local_urls)


if __name__ == '__main__':
with serve_files(PORT):
download_sites([
subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir)


def main():
args = parser.parse_args()
try:
os.makedirs(args.sites_dir)
except OSError as e:
if e.errno != errno.EEXIST:
raise
elif not os.path.isdir(args.sites_dir):
raise RuntimeError("Not a directory: %s" % args.sites_dir)
with serve_files(PORT, args.sites_dir):
download_sites(args.sites_dir, [
'http://www.wikipedia.org',
'http://www.google.com',
'http://www.reddit.com',
Expand All @@ -89,3 +101,7 @@ def download_sites(sites):
# "http://blog.pinterest.com",
# "http://imgur.com",
])


if __name__ == '__main__':
    # Script entry point: parse CLI options and download the benchmark
    # sites into --sites-dir (see main() above).
    main()
23 changes: 18 additions & 5 deletions splash/benchmark/file_server.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
#!/usr/bin/env python

"""
Simple static file server.
"""

import argparse
import os
import SimpleHTTPServer
import SocketServer
import subprocess
import sys
from contextlib import contextmanager


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('port', type=int, help='Port number to listen at')
parser.add_argument('directory', type=str, help='Directory to serve')


class ReusingTCPServer(SocketServer.TCPServer):
    # Allow rebinding the port immediately after a previous run exits
    # (sets SO_REUSEADDR), so back-to-back benchmark runs do not fail
    # with "Address already in use".
    allow_reuse_address = True

Expand All @@ -15,17 +27,18 @@ def address_string(self):


@contextmanager
def serve_files(port, directory):
    """Serve files from *directory* statically in a subprocess.

    Launches this module as a child process (``python -m <this module>
    <port> <directory>``) that serves *directory* over HTTP on *port*,
    yields control to the caller, and terminates the server when the
    ``with`` block exits — even if the body raised.

    :param port: TCP port for the static file server to listen on.
    :param directory: directory whose contents are served.
    """
    # NOTE: the previous docstring claimed files were served from the
    # current directory; the server actually chdirs into ``directory``.
    # ``__name__`` resolves to this module's dotted path, so the child
    # re-runs this same file through its CLI (see the __main__ block).
    site_server = subprocess.Popen(['python', '-m', __name__,
                                    str(port), directory])
    try:
        yield
    finally:
        # Always stop the child server, regardless of caller errors.
        site_server.terminate()


if __name__ == '__main__':
    # Stand-alone mode: serve the requested directory on the requested
    # port until the process is terminated (serve_files() runs us this
    # way in a subprocess).
    opts = parser.parse_args()
    os.chdir(opts.directory)
    httpd = ReusingTCPServer(("", opts.port), RequestHandler)
    httpd.serve_forever()

0 comments on commit fe6752c

Please sign in to comment.