-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
guess_punct_space: remove whitespace before punct
This is similar to webstruct.utils.smart_joins (https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61), but it is applied only at tag boundaries. This mode is slightly slower than the default.
- Loading branch information
Showing 2 changed files with 56 additions and 18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,49 @@ | ||
# -*- coding: utf-8 -*- | ||
import pytest | ||
|
||
from html_text import extract_text, parse_html | ||
|
||
|
||
def test_extract_text(): | ||
@pytest.fixture(params=[{'guess_punct_space': True}, | ||
{'guess_punct_space': False}]) | ||
def all_options(request): | ||
return request.param | ||
|
||
|
||
def test_extract_text(all_options): | ||
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>' | ||
assert extract_text(html) == u'Hello, world!' | ||
assert extract_text(html, **all_options) == u'Hello, world!' | ||
|
||
|
||
def test_declared_encoding(): | ||
def test_declared_encoding(all_options): | ||
html = (u'<?xml version="1.0" encoding="utf-8" ?>' | ||
u'<html><style>.div {}</style>' | ||
u'<body>Hello, world!</p></body></html>') | ||
assert extract_text(html) == u'Hello, world!' | ||
assert extract_text(html, **all_options) == u'Hello, world!' | ||
|
||
|
||
def test_empty(): | ||
assert extract_text(u'') == '' | ||
def test_empty(all_options): | ||
assert extract_text(u'', **all_options) == '' | ||
|
||
|
||
def test_extract_text_from_tree(): | ||
def test_extract_text_from_tree(all_options): | ||
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>' | ||
tree = parse_html(html) | ||
assert extract_text(tree) == u'Hello, world!' | ||
assert extract_text(tree, **all_options) == u'Hello, world!' | ||
|
||
|
||
def test_inline_tags_whitespace(all_options): | ||
html = u'<span>field</span><span>value of</span><span></span>' | ||
assert extract_text(html, **all_options) == u'field value of' | ||
|
||
|
||
def test_punct_whitespace(): | ||
html = u'<div><span>field</span>, and more</div>' | ||
assert extract_text(html) == u'field , and more' | ||
|
||
|
||
def test_inline_tags_whitespace(): | ||
html = u'<span>field</span><span>value</span>' | ||
assert extract_text(html) == u'field value' | ||
def test_punct_whitespace_preserved(): | ||
html = (u'<div><span>по</span><span>ле</span>, and , ' | ||
u'<span>more </span>!<span>now</div>a (<b>boo</b>)') | ||
assert (extract_text(html, guess_punct_space=True) == | ||
u'по ле, and , more ! now a (boo)') |