pull/23/head
Richard Harding 11 years ago
parent 17270db5f0
commit 347f3ea0b5

@ -49,6 +49,13 @@ clean_all: clean_venv
fi
# Install flake8 using bin/pip; depends on the venv target.
bin/flake8: venv
bin/pip install flake8
# Run flake8 against `breadability`; requires bin/flake8 to be built first.
lint: bin/flake8
flake8 breadability
# ###########
# Deploy
# ###########

@ -19,9 +19,13 @@ string_types = (bytes, unicode,)
try:
# Assert to hush pyflakes about the unused import. This is a _compat
# module and we expect this to aid in other code importing urllib.
import urllib2 as urllib
assert urllib
except ImportError:
import urllib.request as urllib
assert urllib
def unicode_compatible(cls):

@ -30,6 +30,9 @@ logger = logging.getLogger("breadability")
# Matches one opening or closing markup tag ("<...>" or "</...>") together
# with any whitespace that directly follows it. The pattern text is passed
# through to_bytes() before compiling.
TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
# Shared HTMLParser instance constructed with encoding="utf8".
UTF8_PARSER = HTMLParser(encoding="utf8")
def determine_encoding(page):
encoding = "utf8"
text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
@ -54,7 +57,12 @@ def determine_encoding(page):
return encoding
# Matches a run of one or more consecutive <br ...> / <hr ...> tags, each
# optionally followed by whitespace. Matching is case-insensitive, so
# <BR> and <Hr> are matched as well.
BREAK_TAGS_PATTERN = re.compile(
to_unicode(r"(?:<\s*[bh]r[^>]*>\s*)+"),
re.IGNORECASE
)
def convert_breaks_to_paragraphs(html):
"""
Converts <hr> tag and multiple <br> tags into paragraph.
@ -75,7 +83,6 @@ def _replace_break_tags(match):
return tags
# HTMLParser instance constructed with encoding="utf8".
# NOTE(review): an identical UTF8_PARSER assignment appears earlier, next to
# TAG_MARK_PATTERN — confirm whether this second assignment is still needed.
UTF8_PARSER = HTMLParser(encoding="utf8")
def build_document(html_content, base_href=None):
"""Requires that the `html_content` not be None"""
assert html_content is not None

Loading…
Cancel
Save