From 6135ba63bb36eb4dcea5c448ac583c39732ba0dd Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 26 May 2017 17:09:17 +0300
Subject: [PATCH 1/6] Add whitespace even for inline tags

Thanks @codinguncut for the suggestion. Still needs testing.
The re.sub call replicates xpath's normalize-space behaviour.
See GH-1.
---
 html_text/html_text.py  | 9 ++++++++-
 tests/test_html_text.py | 5 +++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 532c3ac..dfce3ca 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import re
+
 import lxml
 import lxml.etree
 from lxml.html.clean import Cleaner
@@ -41,8 +43,13 @@ def parse_html(html):
 
 def selector_to_text(sel):
     """ Convert a cleaned selector to text.
+    Almost the same as xpath normalize-space, but this also
+    adds spaces between inline elements (like <span>) which are
+    often used as block elements in html markup.
     """
-    return sel.xpath('normalize-space()').extract_first('')
+    fragments = (re.sub('\s+', ' ', x.strip())
+                 for x in sel.xpath('//text()').extract())
+    return ' '.join(x for x in fragments if x)
 
 
 def cleaned_selector(html):
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index eedba9a..b5078e3 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -23,3 +23,8 @@ def test_extract_text_from_tree():
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</p></body></html>'
     tree = parse_html(html)
     assert extract_text(tree) == u'Hello, world!'
+
+
+def test_inline_tags_whitespace():
+    html = u'<span>field</span><span>value</span>'
+    assert extract_text(html) == u'field value'

From 43f1bd4c559b3835bcacd4b326f9ddf41f2bf836 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 26 May 2017 18:42:33 +0300
Subject: [PATCH 2/6] Cache regexp

Python 2 does not cache regexps compiled via re.sub, and a pre-compiled
regexp is faster even on Python 3.
---
 html_text/html_text.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index dfce3ca..2b61a9a 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -41,13 +41,16 @@ def parse_html(html):
     return lxml.html.fromstring(html.encode('utf8'), parser=parser)
 
 
+_whitespace = re.compile('\s+')
+
+
 def selector_to_text(sel):
     """ Convert a cleaned selector to text.
     Almost the same as xpath normalize-space, but this also
     adds spaces between inline elements (like <span>) which are
     often used as block elements in html markup.
     """
-    fragments = (re.sub('\s+', ' ', x.strip())
+    fragments = (_whitespace.sub(' ', x.strip())
                  for x in sel.xpath('//text()').extract())
     return ' '.join(x for x in fragments if x)

From f020f4bf27a5c7f400b22165ff8cfdd93d3f2b68 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Mon, 29 May 2017 12:22:34 +0300
Subject: [PATCH 3/6] guess_punct_space: remove whitespace before punct

This is similar to webstruct.utils.smart_joins
(https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61),
but is applied only at tag boundaries.

This mode is only slightly slower than the default one.
---
 html_text/html_text.py  | 33 ++++++++++++++++++++++++++-------
 tests/test_html_text.py | 41 ++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 2b61a9a..408048c 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -41,18 +41,36 @@ def parse_html(html):
     return lxml.html.fromstring(html.encode('utf8'), parser=parser)
 
 
-_whitespace = re.compile('\s+')
+_whitespace = re.compile(r'\s+')
+_trailing_whitespace = re.compile(r'\s$')
+_punct_after = re.compile(r'^[,:;.!?"\)]')
+_punct_before = re.compile(r'\($')
 
 
-def selector_to_text(sel):
+def selector_to_text(sel, guess_punct_space=False):
     """ Convert a cleaned selector to text.
     Almost the same as xpath normalize-space, but this also
     adds spaces between inline elements (like <span>) which are
     often used as block elements in html markup.
     """
-    fragments = (_whitespace.sub(' ', x.strip())
-                 for x in sel.xpath('//text()').extract())
-    return ' '.join(x for x in fragments if x)
+    if guess_punct_space:
+
+        def fragments():
+            prev = None
+            for text in sel.xpath('//text()').extract():
+                if prev is not None and (_trailing_whitespace.search(prev)
+                        or (not _punct_after.search(text) and
+                            not _punct_before.search(prev))):
+                    yield ' '
+                yield text
+                prev = text
+
+        return _whitespace.sub(' ', ''.join(fragments()).strip())
+
+    else:
+        fragments = (_whitespace.sub(' ', x.strip())
+                     for x in sel.xpath('//text()').extract())
+        return ' '.join(x for x in fragments if x)
 
 
 def cleaned_selector(html):
@@ -70,10 +88,11 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, encoding='utf8'):
+def extract_text(html, guess_punct_space=False):
     """ Convert html to text.
 
     html should be a unicode string or an already parsed lxml.html element.
""" - return selector_to_text(cleaned_selector(html)) + sel = cleaned_selector(html) + return selector_to_text(sel, guess_punct_space=guess_punct_space) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index b5078e3..f2daac8 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,30 +1,49 @@ # -*- coding: utf-8 -*- +import pytest from html_text import extract_text, parse_html -def test_extract_text(): +@pytest.fixture(params=[{'guess_punct_space': True}, + {'guess_punct_space': False}]) +def all_options(request): + return request.param + + +def test_extract_text(all_options): html = u'

Hello, world!' - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_declared_encoding(): +def test_declared_encoding(all_options): html = (u'' u'' u'Hello, world!

') - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_empty(): - assert extract_text(u'') == '' +def test_empty(all_options): + assert extract_text(u'', **all_options) == '' -def test_extract_text_from_tree(): +def test_extract_text_from_tree(all_options): html = u'

Hello, world!' tree = parse_html(html) - assert extract_text(tree) == u'Hello, world!' + assert extract_text(tree, **all_options) == u'Hello, world!' + + +def test_inline_tags_whitespace(all_options): + html = u'fieldvalue of' + assert extract_text(html, **all_options) == u'field value of' + + +def test_punct_whitespace(): + html = u'

field, and more
' + assert extract_text(html) == u'field , and more' -def test_inline_tags_whitespace(): - html = u'fieldvalue' - assert extract_text(html) == u'field value' +def test_punct_whitespace_preserved(): + html = (u'
поле, and , ' + u'more !now
a (boo)') + assert (extract_text(html, guess_punct_space=True) == + u'по ле, and , more ! now a (boo)') From 73bf2acf66fa7e0bfdd7fc87e431b0d167040cf9 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 12:47:07 +0300 Subject: [PATCH 4/6] Slightly faster and cleaner default path It's fine to apply whitespace cleaning regexp at the end --- html_text/html_text.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 408048c..4bf81ec 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -68,9 +68,8 @@ def fragments(): return _whitespace.sub(' ', ''.join(fragments()).strip()) else: - fragments = (_whitespace.sub(' ', x.strip()) - for x in sel.xpath('//text()').extract()) - return ' '.join(x for x in fragments if x) + fragments = (x.strip() for x in sel.xpath('//text()').extract()) + return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) def cleaned_selector(html): From e9cf9b8647a8804ddfa873c2597949f8ab2eba9f Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 15:17:51 +0300 Subject: [PATCH 5/6] Cache method lookup, more readable loop conditions Thanks for the idea @kmike! --- html_text/html_text.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 4bf81ec..465ebad 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -42,9 +42,9 @@ def parse_html(html): _whitespace = re.compile(r'\s+') -_trailing_whitespace = re.compile(r'\s$') -_punct_after = re.compile(r'^[,:;.!?"\)]') -_punct_before = re.compile(r'\($') +_has_trailing_whitespace = re.compile(r'\s$').search +_has_punct_after = re.compile(r'^[,:;.!?"\)]').search +_has_punct_before = re.compile(r'\($').search def selector_to_text(sel, guess_punct_space=False): @@ -58,9 +58,9 @@ def selector_to_text(sel, guess_punct_space=False): def fragments(): prev = None for text in sel.xpath('//text()').extract(): - if prev is not None and (_trailing_whitespace.search(prev) - or (not _punct_after.search(text) and - not _punct_before.search(prev))): + if prev is not None and (_has_trailing_whitespace(prev) + or (not _has_punct_after(text) and + not _has_punct_before(prev))): yield ' ' yield text prev = text From 1fb2ec4e8044ff35b44e6a8b7382a939e1cab542 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 15:20:01 +0300 Subject: [PATCH 6/6] Make guess_punct_space=True by default, document --- README.rst | 4 ++++ html_text/html_text.py | 15 ++++++++++----- tests/test_html_text.py | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 4c24d79..19e14d1 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML or ``.get_text()`` from Beautiful Soup? Text extracted with ``html_text`` does not contain inline styles, javascript, comments and other text that is not normally visible to the users. +It normalizes whitespace, but is also smarter than ``.xpath('normalize-space())``, +adding spaces around inline elements too +(which are often used as block elements in html markup), +and tries to avoid adding extra spaces for punctuation. 
 
 Install
 =======
diff --git a/html_text/html_text.py b/html_text/html_text.py
index 465ebad..56d220e 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -47,11 +47,9 @@ def parse_html(html):
 _has_punct_before = re.compile(r'\($').search
 
 
-def selector_to_text(sel, guess_punct_space=False):
+def selector_to_text(sel, guess_punct_space=True):
     """ Convert a cleaned selector to text.
-    Almost the same as xpath normalize-space, but this also
-    adds spaces between inline elements (like <span>) which are
-    often used as block elements in html markup.
+    See html_text.extract_text docstring for description of the approach and options.
     """
     if guess_punct_space:
 
@@ -87,9 +85,16 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, guess_punct_space=False):
+def extract_text(html, guess_punct_space=True):
    """ Convert html to text.
+    Almost the same as normalize-space xpath, but this also
+    adds spaces between inline elements (like <span>) which are
+    often used as block elements in html markup.
+
+    When guess_punct_space is True (default), no extra whitespace is added
+    for punctuation. This has a slight (around 10%) performance overhead
+    and is just a heuristic.
 
     html should be a unicode string or an already parsed lxml.html element.
     """
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index f2daac8..1205da7 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -39,7 +39,7 @@ def test_inline_tags_whitespace(all_options):
 
 def test_punct_whitespace():
     html = u'<div><b>field</b>, and more</div>'
-    assert extract_text(html) == u'field , and more'
+    assert extract_text(html, guess_punct_space=False) == u'field , and more'
 
 
 def test_punct_whitespace_preserved():