pull/31/merge
pictuga 6 years ago committed by GitHub
commit a5be4ac2f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -85,7 +85,7 @@ def get_link_density(node, node_text=None):
"""
if node_text is None:
node_text = node.text_content()
node_text = normalize_whitespace(node_text.strip())
node_text = normalize_whitespace(node_text)
text_length = len(node_text)
if text_length == 0:
@ -101,7 +101,7 @@ def get_link_density(node, node_text=None):
def _get_normalized_text_length(node):
return len(normalize_whitespace(node.text_content().strip()))
return len(normalize_whitespace(node.text_content()))
def get_class_weight(node):

@ -18,9 +18,6 @@ except ImportError:
pass
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
def is_blank(text):
"""
Returns ``True`` if string contains only whitespace characters
@ -29,26 +26,14 @@ def is_blank(text):
return not text or text.isspace()
def shrink_text(text):
return normalize_whitespace(text.strip())
def normalize_whitespace(text):
"""
Translates each run of whitespace characters into a single space character.
If the run contains at least one new line character, it is replaced
by a single LF (Unix new line) character instead.
"""
return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)
return ' '.join(text.split())
def _replace_whitespace(match):
text = match.group()
if "\n" in text or "\r" in text:
return "\n"
else:
return " "
shrink_text = normalize_whitespace
def cached_property(getter):

Loading…
Cancel
Save