Skip to content

Commit

Permalink
Initial version of benchmark suite
Browse files Browse the repository at this point in the history
  • Loading branch information
immerrr committed Feb 27, 2015
1 parent a6b1371 commit 2480ef0
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 0 deletions.
9 changes: 9 additions & 0 deletions splash/benchmark/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
This directory contains a preliminary version of the Splash benchmark suite.

To use it, do the following:

- install ``httrack``
- create a directory for downloaded files, e.g. ``files``
- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
- run ``python benchmark.py`` to run the benchmark

80 changes: 80 additions & 0 deletions splash/benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python

"""
Splash benchmark script.
It takes a directory downloaded with splash & httrack, fires up a static file
server and runs a series of requests via splash on those downloaded pages.
"""

import logging
import random
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from glob import glob
from multiprocessing.pool import ThreadPool

import requests
from splash.file_server import serve_files
from splash.tests.utils import SplashServer

#: Port on which the static file server (splash.file_server) is started.
PORT = 8806
#: URLs to benchmark against.
#: NOTE(review): the glob is evaluated at import time relative to the current
#: working directory — run this script from the download directory (see README).
PAGES = glob('localhost_8806/*.html')
#: Combinations of width & height to test.
WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
# XXX: add benchmark of different API endpoints.
#: Logfile passed to the SplashServer instance started by main().
SPLASH_LOG = 'splash.log'

# Command-line interface; parsed in main().
parser = ArgumentParser(description=__doc__,
                        formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number')
parser.add_argument('--thread-count', type=int, default=1,
                    help='Request thread count')
parser.add_argument('--request-count', type=int, default=10,
                    help='Benchmark request count')


def generate_requests(splash, args):
    """Generate benchmark request descriptions.

    Yields ``(request_no, total_requests, kwargs)`` tuples where ``kwargs``
    is suitable for ``requests.get(**kwargs)``: a Splash ``render.png``
    endpoint URL plus the page/width/height parameters.

    :param splash: running Splash server (provides ``url()`` for endpoints).
    :param args: parsed command-line arguments (``seed``, ``request_count``).
    """
    log = logging.getLogger('generate_requests')
    log.info("Using pRNG seed: %s", args.seed)
    # Dedicated PRNG instance so the request sequence is reproducible
    # for a given --seed regardless of other random() consumers.
    rng = random.Random(args.seed)
    # range (not Python-2-only xrange) keeps this importable on Python 3 too.
    for i in range(args.request_count):
        page = rng.choice(PAGES)
        width, height = rng.choice(WIDTH_HEIGHT)
        url = 'http://localhost:%d/%s' % (PORT, page)
        yield (i + 1, args.request_count,
               {'url': splash.url('render.png'),
                'params': {'url': url, 'width': width, 'height': height}})


def parallel_map(func, iterable, thread_count):
    """Map ``func`` over ``iterable``, optionally in parallel.

    With ``thread_count == 1`` the work is done in the calling thread;
    otherwise a thread pool of the given size is used.  Always returns a
    list (bare ``map`` is an iterator on Python 3, so it is materialized
    for a consistent return type).

    :returns: list of results in input order.
    """
    if thread_count == 1:
        return list(map(func, iterable))
    pool = ThreadPool(thread_count)
    try:
        return pool.map(func, iterable)
    finally:
        # Shut the pool down explicitly so worker threads don't leak.
        pool.close()
        pool.join()


def invoke_request(invoke_args):
    """Execute one benchmark HTTP request.

    ``invoke_args`` is a ``(request_no, total_requests, kwargs)`` tuple as
    produced by ``generate_requests``; ``kwargs`` is passed straight to
    ``requests.get``.  Returns the ``requests`` response object.
    """
    seq_no, total, request_kwargs = invoke_args
    logger = logging.getLogger('bench_worker')
    logger.info("Initiating request %d/%d: %s", seq_no, total, request_kwargs)
    return requests.get(**request_kwargs)


def main():
    """Entry point: start Splash plus the file server and run the benchmark."""
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    splash_server = SplashServer(
        logfile=SPLASH_LOG,
        extra_args=['--disable-lua-sandbox',
                    '--disable-xvfb',
                    '--max-timeout=600'])
    with splash_server as splash:
        with serve_files(PORT):
            parallel_map(invoke_request, generate_requests(splash, args),
                         args.thread_count)


# Script entry point: run the benchmark when executed directly.
if __name__ == '__main__':
    main()
90 changes: 90 additions & 0 deletions splash/benchmark/download_sites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from splash.tests.stress import lua_runonce

import re
from urlparse import urlsplit
import json
from lxml import html
import w3lib.html
import subprocess
from splash.file_server import serve_files

#: Lua script run via Splash: load the page with images disabled, wait for
#: late content, and return the final (post-redirect) URL plus rendered HTML.
script_html = """
function main(splash)
splash:set_images_enabled(false)
splash:go(splash.args.url)
splash:wait(0.5)
return {url=splash:url(), html=splash:html()}
end
"""

#: Lua script that renders a page to PNG.
#: NOTE(review): not referenced in this file — presumably kept for future
#: endpoint benchmarks; confirm before removing.
script_png = """
function main(splash)
splash:go(splash.args.url)
splash:wait(0.5)
return splash:png()
end
"""


# User agent string matching the one Splash's WebKit uses, so httrack
# downloads the same resources Splash would request.
USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"


#: Port for the local static file server (must match benchmark.py).
PORT = 8806


def preprocess_main_page(url):
    """Render *url* via Splash and save its HTML to a local file.

    The page is rendered headlessly (images disabled) through Splash; if the
    resulting HTML has no ``<base>`` tag, scripts are stripped and a
    ``<base href=final_url>`` plus a utf-8 ``<meta charset>`` are injected so
    relative links keep working when the page is served locally.

    :param url: URL of the page to download.
    :returns: name of the file the HTML was written to (derived from *url*).
    """
    out = json.loads(lua_runonce(script_html, url=url,
                                 splash_args=['--disable-lua-sandbox',
                                              '--disable-xvfb',
                                              '--max-timeout=600'],
                                 timeout=600.,))
    # Final URL after redirects, with query string and fragment removed.
    final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
    if not w3lib.html.get_base_url(out['html']):
        # No explicit <base> in the page: drop scripts (they may rewrite
        # URLs at runtime) and inject one pointing at the original site.
        out['html'] = w3lib.html.remove_tags_with_content(
            out['html'], ('script',))
        root = html.fromstring(out['html'], parser=html.HTMLParser(),
                               base_url=final_url)
        try:
            head = root.xpath('./head')[0]
        except IndexError:
            # Page has no <head>; create one as the first child.
            head = html.Element('head')
            root.insert(0, head)
        head.insert(0, html.Element('base', {'href': final_url}))
        head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
        out['html'] = html.tostring(root, encoding='utf-8',
                                    doctype='<!DOCTYPE html>')
    # Derive a filesystem-safe filename from the URL.
    filename = re.sub(r'[^\w]+', '_', url) + '.html'
    with open(filename, 'w') as f:
        f.write(out['html'])
    return filename


def download_sites(sites):
    """Preprocess *sites* through Splash, then mirror them with httrack.

    Each site is first rendered and saved locally (see
    ``preprocess_main_page``), then httrack fetches the referenced assets
    via the local file server.
    """
    filenames = [preprocess_main_page(site) for site in sites]

    local_urls = [
        'http://localhost:%(port)d/%(filename)s' % {
            'port': PORT, 'filename': name
        }
        for name in filenames
    ]
    httrack_args = [
        '--continue',
        '--near',          # Fetch referred non-html files.
        '-%P',             # Try parsing links in non-href/src sections
        '-F', USERAGENT,   # Emulate splash UA
        '--depth=1',
    ]
    subprocess.check_call(['httrack'] + httrack_args + local_urls)


# Script entry point: start the local file server, then download and
# preprocess the benchmark pages.
if __name__ == '__main__':
    with serve_files(PORT):
        download_sites([
            'http://www.wikipedia.org',
            'http://www.google.com',
            'http://www.reddit.com',
            "http://w3.org",
            "http://w3.org/TR/2010/REC-xhtml-basic-20101123/",
            # "http://blog.pinterest.com",
            # "http://imgur.com",
        ])
31 changes: 31 additions & 0 deletions splash/benchmark/file_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import SimpleHTTPServer
import SocketServer
import subprocess
import sys
from contextlib import contextmanager


class ReusingTCPServer(SocketServer.TCPServer):
    """TCPServer that sets SO_REUSEADDR so the port can be rebound quickly."""
    # Allows restarting the benchmark without waiting for TIME_WAIT to expire.
    allow_reuse_address = True


class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Static file handler with a fixed client name in log output."""
    def address_string(self):
        # Return a constant instead of the client address — avoids the
        # per-request reverse DNS lookup the base class performs.
        return "fileserver"


@contextmanager
def serve_files(port):
    """Serve files from current directory statically in a subprocess.

    The server runs for the duration of the ``with`` block and is
    terminated on exit.

    :param port: TCP port the file server listens on.
    """
    # sys.executable (not a bare 'python') so the server runs under the
    # same interpreter/virtualenv as the caller.
    site_server = subprocess.Popen([sys.executable, '-m', __name__,
                                    str(port)])
    try:
        yield
    finally:
        site_server.terminate()
        # Reap the child so it doesn't linger as a zombie.
        site_server.wait()


# Script entry point: serve the current directory over HTTP on the port
# given as the first command-line argument (used by serve_files above).
if __name__ == '__main__':
    port = int(sys.argv[1])
    server = ReusingTCPServer(("", port), RequestHandler)
    server.serve_forever()

0 comments on commit 2480ef0

Please sign in to comment.