From 6135ba63bb36eb4dcea5c448ac583c39732ba0dd Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 26 May 2017 17:09:17 +0300
Subject: [PATCH 1/6] Add whitespace even for inline tags

Thanks @codinguncut for the suggestion. Still needs testing.
The re.sub call replicates xpath's normalize-space behaviour.
See GH-1.
---
 html_text/html_text.py  | 9 ++++++++-
 tests/test_html_text.py | 5 +++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 532c3ac..dfce3ca 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import re
+
 import lxml
 import lxml.etree
 from lxml.html.clean import Cleaner
@@ -41,8 +43,13 @@ def parse_html(html):
 
 def selector_to_text(sel):
     """ Convert a cleaned selector to text.
+    Almost the same as xpath normalize-space, but this also
+    adds spaces between inline elements (like <span>) which are
+    often used as block elements in html markup.
     """
-    return sel.xpath('normalize-space()').extract_first('')
+    fragments = (re.sub('\s+', ' ', x.strip())
+                 for x in sel.xpath('//text()').extract())
+    return ' '.join(x for x in fragments if x)
 
 
 def cleaned_selector(html):
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index eedba9a..b5078e3 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -23,3 +23,8 @@ def test_extract_text_from_tree():
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</p></body></html>'
     tree = parse_html(html)
     assert extract_text(tree) == u'Hello, world!'
+
+
+def test_inline_tags_whitespace():
+    html = u'<span>field</span><span>value</span>'
+    assert extract_text(html) == u'field value'

From 43f1bd4c559b3835bcacd4b326f9ddf41f2bf836 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 26 May 2017 18:42:33 +0300
Subject: [PATCH 2/6] Cache regexp

Python 2 does not cache regexps compiled via re.sub, and a pre-compiled
regexp is faster even on Python 3.
---
 html_text/html_text.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index dfce3ca..2b61a9a 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -41,13 +41,16 @@ def parse_html(html):
     return lxml.html.fromstring(html.encode('utf8'), parser=parser)
 
 
+_whitespace = re.compile('\s+')
+
+
 def selector_to_text(sel):
     """ Convert a cleaned selector to text.
     Almost the same as xpath normalize-space, but this also
     adds spaces between inline elements (like <span>) which are
     often used as block elements in html markup.
     """
-    fragments = (re.sub('\s+', ' ', x.strip())
+    fragments = (_whitespace.sub(' ', x.strip())
                  for x in sel.xpath('//text()').extract())
     return ' '.join(x for x in fragments if x)

From f020f4bf27a5c7f400b22165ff8cfdd93d3f2b68 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Mon, 29 May 2017 12:22:34 +0300
Subject: [PATCH 3/6] guess_punct_space: remove whitespace before punct

This is similar to webstruct.utils.smart_joins
(https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61),
but is applied only at tag boundaries.

This mode is only slightly slower than the default one.
---
 html_text/html_text.py  | 33 ++++++++++++++++++++++++++-------
 tests/test_html_text.py | 41 ++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 2b61a9a..408048c 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -41,18 +41,36 @@ def parse_html(html):
     return lxml.html.fromstring(html.encode('utf8'), parser=parser)
 
 
-_whitespace = re.compile('\s+')
+_whitespace = re.compile(r'\s+')
+_trailing_whitespace = re.compile(r'\s$')
+_punct_after = re.compile(r'^[,:;.!?"\)]')
+_punct_before = re.compile(r'\($')
 
 
-def selector_to_text(sel):
+def selector_to_text(sel, guess_punct_space=False):
     """ Convert a cleaned selector to text.
     Almost the same as xpath normalize-space, but this also
     adds spaces between inline elements (like <span>) which are
     often used as block elements in html markup.
     """
-    fragments = (_whitespace.sub(' ', x.strip())
-                 for x in sel.xpath('//text()').extract())
-    return ' '.join(x for x in fragments if x)
+    if guess_punct_space:
+
+        def fragments():
+            prev = None
+            for text in sel.xpath('//text()').extract():
+                if prev is not None and (_trailing_whitespace.search(prev)
+                        or (not _punct_after.search(text) and
+                            not _punct_before.search(prev))):
+                    yield ' '
+                yield text
+                prev = text
+
+        return _whitespace.sub(' ', ''.join(fragments()).strip())
+
+    else:
+        fragments = (_whitespace.sub(' ', x.strip())
+                     for x in sel.xpath('//text()').extract())
+        return ' '.join(x for x in fragments if x)
 
 
 def cleaned_selector(html):
@@ -70,10 +88,11 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, encoding='utf8'):
+def extract_text(html, guess_punct_space=False):
     """ Convert html to text.
 
     html should be a unicode string or an already parsed lxml.html element.
""" - return selector_to_text(cleaned_selector(html)) + sel = cleaned_selector(html) + return selector_to_text(sel, guess_punct_space=guess_punct_space) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index b5078e3..f2daac8 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,30 +1,49 @@ # -*- coding: utf-8 -*- +import pytest from html_text import extract_text, parse_html -def test_extract_text(): +@pytest.fixture(params=[{'guess_punct_space': True}, + {'guess_punct_space': False}]) +def all_options(request): + return request.param + + +def test_extract_text(all_options): html = u'

Hello, world!' - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_declared_encoding(): +def test_declared_encoding(all_options): html = (u'' u'' u'Hello, world!

') - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_empty(): - assert extract_text(u'') == '' +def test_empty(all_options): + assert extract_text(u'', **all_options) == '' -def test_extract_text_from_tree(): +def test_extract_text_from_tree(all_options): html = u'

Hello, world!' tree = parse_html(html) - assert extract_text(tree) == u'Hello, world!' + assert extract_text(tree, **all_options) == u'Hello, world!' + + +def test_inline_tags_whitespace(all_options): + html = u'fieldvalue of' + assert extract_text(html, **all_options) == u'field value of' + + +def test_punct_whitespace(): + html = u'

field, and more
' + assert extract_text(html) == u'field , and more' -def test_inline_tags_whitespace(): - html = u'fieldvalue' - assert extract_text(html) == u'field value' +def test_punct_whitespace_preserved(): + html = (u'
поле, and , ' + u'more !now
a (boo)') + assert (extract_text(html, guess_punct_space=True) == + u'по ле, and , more ! now a (boo)') From 73bf2acf66fa7e0bfdd7fc87e431b0d167040cf9 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 12:47:07 +0300 Subject: [PATCH 4/6] Slightly faster and cleaner default path It's fine to apply whitespace cleaning regexp at the end --- html_text/html_text.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 408048c..4bf81ec 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -68,9 +68,8 @@ def fragments(): return _whitespace.sub(' ', ''.join(fragments()).strip()) else: - fragments = (_whitespace.sub(' ', x.strip()) - for x in sel.xpath('//text()').extract()) - return ' '.join(x for x in fragments if x) + fragments = (x.strip() for x in sel.xpath('//text()').extract()) + return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) def cleaned_selector(html): From e9cf9b8647a8804ddfa873c2597949f8ab2eba9f Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 15:17:51 +0300 Subject: [PATCH 5/6] Cache method lookup, more readable loop conditions Thanks for the idea @kmike! --- html_text/html_text.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 4bf81ec..465ebad 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -42,9 +42,9 @@ def parse_html(html): _whitespace = re.compile(r'\s+') -_trailing_whitespace = re.compile(r'\s$') -_punct_after = re.compile(r'^[,:;.!?"\)]') -_punct_before = re.compile(r'\($') +_has_trailing_whitespace = re.compile(r'\s$').search +_has_punct_after = re.compile(r'^[,:;.!?"\)]').search +_has_punct_before = re.compile(r'\($').search def selector_to_text(sel, guess_punct_space=False): @@ -58,9 +58,9 @@ def selector_to_text(sel, guess_punct_space=False): def fragments(): prev = None for text in sel.xpath('//text()').extract(): - if prev is not None and (_trailing_whitespace.search(prev) - or (not _punct_after.search(text) and - not _punct_before.search(prev))): + if prev is not None and (_has_trailing_whitespace(prev) + or (not _has_punct_after(text) and + not _has_punct_before(prev))): yield ' ' yield text prev = text From 1fb2ec4e8044ff35b44e6a8b7382a939e1cab542 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 15:20:01 +0300 Subject: [PATCH 6/6] Make guess_punct_space=True by default, document --- README.rst | 4 ++++ html_text/html_text.py | 15 ++++++++++----- tests/test_html_text.py | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 4c24d79..19e14d1 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML or ``.get_text()`` from Beautiful Soup? Text extracted with ``html_text`` does not contain inline styles, javascript, comments and other text that is not normally visible to the users. +It normalizes whitespace, but is also smarter than ``.xpath('normalize-space())``, +adding spaces around inline elements too +(which are often used as block elements in html markup), +and tries to avoid adding extra spaces for punctuation. 
 
 Install
 =======
diff --git a/html_text/html_text.py b/html_text/html_text.py
index 465ebad..56d220e 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -47,11 +47,9 @@ def parse_html(html):
 _has_punct_before = re.compile(r'\($').search
 
 
-def selector_to_text(sel, guess_punct_space=False):
+def selector_to_text(sel, guess_punct_space=True):
     """ Convert a cleaned selector to text.
-    Almost the same as xpath normalize-space, but this also
-    adds spaces between inline elements (like <span>) which are
-    often used as block elements in html markup.
+    See html_text.extract_text docstring for description of the approach and options.
     """
     if guess_punct_space:
 
@@ -87,9 +85,16 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, guess_punct_space=False):
+def extract_text(html, guess_punct_space=True):
    """ Convert html to text.
+    Almost the same as normalize-space xpath, but this also
+    adds spaces between inline elements (like <span>) which are
+    often used as block elements in html markup.
+
+    When guess_punct_space is True (default), no extra whitespace is added
+    for punctuation. This has a slight (around 10%) performance overhead
+    and is just a heuristic.
 
     html should be a unicode string or an already parsed lxml.html element.
     """
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index f2daac8..1205da7 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -39,7 +39,7 @@ def test_inline_tags_whitespace(all_options):
 
 def test_punct_whitespace():
     html = u'<div><b>field</b>, and more</div>'
-    assert extract_text(html) == u'field , and more'
+    assert extract_text(html, guess_punct_space=False) == u'field , and more'
 
 
 def test_punct_whitespace_preserved():