diff --git a/html_text/html_text.py b/html_text/html_text.py index 2b61a9a..408048c 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -41,18 +41,36 @@ def parse_html(html): return lxml.html.fromstring(html.encode('utf8'), parser=parser) -_whitespace = re.compile('\s+') +_whitespace = re.compile(r'\s+') +_trailing_whitespace = re.compile(r'\s$') +_punct_after = re.compile(r'^[,:;.!?"\)]') +_punct_before = re.compile(r'\($') -def selector_to_text(sel): +def selector_to_text(sel, guess_punct_space=False): """ Convert a cleaned selector to text. Almost the same as xpath normalize-space, but this also adds spaces between inline elements (like <span>) which are often used as block elements in html markup. """ - fragments = (_whitespace.sub(' ', x.strip()) - for x in sel.xpath('//text()').extract()) - return ' '.join(x for x in fragments if x) + if guess_punct_space: + + def fragments(): + prev = None + for text in sel.xpath('//text()').extract(): + if prev is not None and (_trailing_whitespace.search(prev) + or (not _punct_after.search(text) and + not _punct_before.search(prev))): + yield ' ' + yield text + prev = text + + return _whitespace.sub(' ', ''.join(fragments()).strip()) + + else: + fragments = (_whitespace.sub(' ', x.strip()) + for x in sel.xpath('//text()').extract()) + return ' '.join(x for x in fragments if x) def cleaned_selector(html): @@ -70,10 +88,11 @@ def cleaned_selector(html): return sel -def extract_text(html, encoding='utf8'): +def extract_text(html, guess_punct_space=False): """ Convert html to text. html should be a unicode string or an already parsed lxml.html element. 
""" - return selector_to_text(cleaned_selector(html)) + sel = cleaned_selector(html) + return selector_to_text(sel, guess_punct_space=guess_punct_space) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index b5078e3..f2daac8 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,30 +1,49 @@ # -*- coding: utf-8 -*- +import pytest from html_text import extract_text, parse_html -def test_extract_text(): +@pytest.fixture(params=[{'guess_punct_space': True}, + {'guess_punct_space': False}]) +def all_options(request): + return request.param + + +def test_extract_text(all_options): html = u'

Hello, world!' - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_declared_encoding(): +def test_declared_encoding(all_options): html = (u'' u'' u'Hello, world!

') - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_empty(): - assert extract_text(u'') == '' +def test_empty(all_options): + assert extract_text(u'', **all_options) == '' -def test_extract_text_from_tree(): +def test_extract_text_from_tree(all_options): html = u'

Hello, world!' tree = parse_html(html) - assert extract_text(tree) == u'Hello, world!' + assert extract_text(tree, **all_options) == u'Hello, world!' + + +def test_inline_tags_whitespace(all_options): + html = u'fieldvalue of' + assert extract_text(html, **all_options) == u'field value of' + + +def test_punct_whitespace(): + html = u'

field, and more
' + assert extract_text(html) == u'field , and more' -def test_inline_tags_whitespace(): - html = u'fieldvalue' - assert extract_text(html) == u'field value' +def test_punct_whitespace_preserved(): + html = (u'
поле, and , ' + u'more !now
a (boo)') + assert (extract_text(html, guess_punct_space=True) == + u'по ле, and , more ! now a (boo)')