diff --git a/Makefile b/Makefile index e7cbf54..eaf10c1 100644 --- a/Makefile +++ b/Makefile @@ -34,8 +34,8 @@ clean_venv: rm -rf bin include lib local man share .PHONY: develop -develop: lib/python*/site-packages/readability.egg-link -lib/python*/site-packages/readability.egg-link: +develop: lib/python*/site-packages/breadability.egg-link +lib/python*/site-packages/breadability.egg-link: $(PY) setup.py develop diff --git a/README.rst b/README.rst index 21d6efc..5f8a13e 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -Readability.py - another readability Python port +breadability - another readability Python port ================================================= .. image:: https://api.travis-ci.org/bookieio/breadability.png?branch=master :target: https://travis-ci.org/bookieio/breadability.py diff --git a/readability/__init__.py b/breadability/__init__.py similarity index 68% rename from readability/__init__.py rename to breadability/__init__.py index d3ba9fb..1cfe60b 100644 --- a/readability/__init__.py +++ b/breadability/__init__.py @@ -8,4 +8,4 @@ from __future__ import ( ) import pkg_resources -__version__ = pkg_resources.get_distribution("readability").version +__version__ = pkg_resources.get_distribution("breadability").version diff --git a/readability/_compat.py b/breadability/_compat.py similarity index 99% rename from readability/_compat.py rename to breadability/_compat.py index c6496d1..76f5c98 100644 --- a/readability/_compat.py +++ b/breadability/_compat.py @@ -56,7 +56,6 @@ def to_bytes(object): return to_bytes(repr(object)) - def to_unicode(object): try: if isinstance(object, unicode): diff --git a/readability/annotated_text.py b/breadability/annotated_text.py similarity index 100% rename from readability/annotated_text.py rename to breadability/annotated_text.py diff --git a/readability/document.py b/breadability/document.py similarity index 98% rename from readability/document.py rename to breadability/document.py index ccf594c..31780c7 100644 --- a/readability/document.py +++ b/breadability/document.py @@ -15,7 +15,7 @@ from ._compat import unicode, to_bytes, to_unicode, unicode_compatible from .utils import cached_property -logger = logging.getLogger("readability") +logger = logging.getLogger("breadability") TAG_MARK_PATTERN = re.compile(to_bytes(r"]*>\s*")) diff --git a/readability/readable.py b/breadability/readable.py similarity index 90% rename from readability/readable.py rename to breadability/readable.py index 4d297dd..e6ed4b4 100644 --- a/readability/readable.py +++ b/breadability/readable.py @@ -13,12 +13,17 @@ from lxml.html import fragment_fromstring, fromstring from .document import OriginalDocument from .annotated_text import AnnotatedTextHandler -from .scoring import (score_candidates, get_link_density, get_class_weight, - is_unlikely_node) +from .scoring import ( + get_class_weight, + get_link_density, + is_unlikely_node, + score_candidates, +) from .utils import cached_property, shrink_text -html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, +html_cleaner = Cleaner( + scripts=True, javascript=True, comments=True, style=True, links=True, meta=False, add_nofollow=False, page_structure=False, processing_instructions=True, embedded=False, frames=False, forms=False, @@ -44,7 +49,7 @@ NULL_DOCUMENT = """ """ -logger = logging.getLogger("readability") +logger = logging.getLogger("breadability") def ok_embedded_video(node): @@ -129,7 +134,8 @@ def check_siblings(candidate_node, candidate_list): content_bonus += candidate_node.content_score * 0.2 if sibling in candidate_list: - adjusted_score = candidate_list[sibling].content_score + content_bonus + adjusted_score = \ + candidate_list[sibling].content_score + content_bonus if adjusted_score >= sibling_target_score: append = True @@ -146,7 +152,8 @@ def check_siblings(candidate_node, candidate_list): append = True if append: - logger.debug("Sibling appended: %s %r", sibling.tag, sibling.attrib) + logger.debug( + "Sibling appended: %s %r", sibling.tag, sibling.attrib) if sibling.tag not in ("div", "p"): # We have a node that isn't a common block level element, like # a form or td tag. Turn it into a div so it doesn't get @@ -191,7 +198,8 @@ def clean_document(node): if n.tag in ("div", "p"): text_content = shrink_text(n.text_content()) if len(text_content) < 5 and not n.getchildren(): - logger.debug("Dropping %s %r without content.", n.tag, n.attrib) + logger.debug( + "Dropping %s %r without content.", n.tag, n.attrib) to_drop.append(n) # finally try out the conditional cleaning of the target node @@ -206,7 +214,8 @@ def clean_document(node): def drop_nodes_with_parents(nodes): for node in nodes: if node.getparent() is not None: - logger.debug("Droping node with parent %s %r", node.tag, node.attrib) + logger.debug( + "Droping node with parent %s %r", node.tag, node.attrib) node.drop_tree() @@ -231,7 +240,8 @@ def clean_conditionally(node): commas_count = node.text_content().count(',') if commas_count < 10: - logger.debug("There are %d commas so we're processing more.", commas_count) + logger.debug( + "There are %d commas so we're processing more.", commas_count) # If there are not very many commas, and the number of # non-paragraph elements is more than paragraphs or other ominous @@ -267,7 +277,8 @@ def clean_conditionally(node): logger.debug('Conditional drop: weight big but link heavy') remove_node = True elif (embed == 1 and content_length < 75) or embed > 1: - logger.debug('Conditional drop: embed w/o much content or many embed') + logger.debug( + 'Conditional drop: embed w/o much content or many embed') remove_node = True if remove_node: @@ -305,10 +316,12 @@ def find_candidates(document): for node in document.iter(): if is_unlikely_node(node): - logger.debug("We should drop unlikely: %s %r", node.tag, node.attrib) + logger.debug( + "We should drop unlikely: %s %r", node.tag, node.attrib) should_remove.add(node) elif is_bad_link(node): - logger.debug("We should drop bad link: %s %r", node.tag, node.attrib) + logger.debug( + "We should drop bad link: %s %r", node.tag, node.attrib) should_remove.add(node) elif node.tag in SCORABLE_TAGS: nodes_to_score.add(node) @@ -403,7 +416,8 @@ class Article(object): return self._handle_no_candidates() # right now we return the highest scoring candidate content - best_candidates = sorted((c for c in self.candidates.values()), + best_candidates = sorted( + (c for c in self.candidates.values()), key=attrgetter("content_score"), reverse=True) printer = PrettyPrinter(indent=2) @@ -415,9 +429,11 @@ class Article(object): updated_winner = check_siblings(winner, self.candidates) updated_winner.node = prep_article(updated_winner.node) if updated_winner.node is not None: - dom = build_base_document(updated_winner.node, self._return_fragment) + dom = build_base_document( + updated_winner.node, self._return_fragment) else: - logger.warning('Had candidates but failed to find a cleaned winning DOM.') + logger.warning( + 'Had candidates but failed to find a cleaned winning DOM.') dom = self._handle_no_candidates() return self._remove_orphans(dom.get_element_by_id("readabilityBody")) @@ -437,7 +453,8 @@ class Article(object): if self.dom is not None and len(self.dom): dom = prep_article(self.dom) dom = build_base_document(dom, self._return_fragment) - return self._remove_orphans(dom.get_element_by_id("readabilityBody")) + return self._remove_orphans( + dom.get_element_by_id("readabilityBody")) else: logger.warning("No document to use.") return build_error_document(self._return_fragment) @@ -454,7 +471,8 @@ def leaf_div_elements_into_paragraphs(document): for element in document.iter(tag="div"): child_tags = tuple(n.tag for n in element.getchildren()) if "div" not in child_tags and "p" not in child_tags: - logger.debug("Changing leaf block element <%s> into

", element.tag) + logger.debug( + "Changing leaf block element <%s> into

", element.tag) element.tag = "p" return document diff --git a/readability/scoring.py b/breadability/scoring.py similarity index 88% rename from readability/scoring.py rename to breadability/scoring.py index 3b90db8..2eaa728 100644 --- a/readability/scoring.py +++ b/breadability/scoring.py @@ -17,9 +17,9 @@ from .utils import normalize_whitespace # A series of sets of attributes we check to help in determining if a node is # a potential candidate or not. CLS_UNLIKELY = re.compile( - "combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|" - "sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|tweet|" - "twitter|social|breadcrumb", + "combx|comment|community|disqus|extra|foot|header|menu|remark|rss|" + "shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|" + "tweet|twitter|social|breadcrumb", re.IGNORECASE ) CLS_MAYBE = re.compile( @@ -32,12 +32,12 @@ CLS_WEIGHT_POSITIVE = re.compile( ) CLS_WEIGHT_NEGATIVE = re.compile( "combx|comment|com-|contact|foot|footer|footnote|head|masthead|media|meta|" - "outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|" - "widget", + "outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|" + "tool|widget", re.IGNORECASE ) -logger = logging.getLogger("readability") +logger = logging.getLogger("breadability") def check_node_attributes(pattern, node, *attributes): @@ -52,6 +52,7 @@ def check_node_attributes(pattern, node, *attributes): return False + def generate_hash_id(node): """ Generates a hash_id for the node in question. @@ -60,7 +61,7 @@ def generate_hash_id(node): """ try: content = tostring(node) - except Exception as e: + except Exception: logger.exception("Generating of hash failed") content = to_bytes(repr(node)) @@ -90,14 +91,15 @@ def get_link_density(node, node_text=None): if text_length == 0: return 0.0 - - link_length = sum([len(a.text_content()) or 0 - for a in node.findall(".//a")]) + link_length = sum( + [len(a.text_content()) or 0 for a in node.findall(".//a")] + ) # For each img, give 50 bonus chars worth of length. # Tweaking this 50 down a notch should help if we hit false positives. - links_length = max(link_length - - sum([50 for img in node.findall(".//img")]), 0) + links_length = max( + link_length - sum([50 for img in node.findall(".//img")]), 0 + ) return links_length / text_length @@ -148,8 +150,8 @@ def score_candidates(nodes): for node in nodes: logger.debug("* Scoring candidate %s %r", node.tag, node.attrib) - # if the node has no parent it knows of - # then it ends up creating a body & html tag to parent the html fragment + # if the node has no parent it knows of then it ends up creating a + # body & html tag to parent the html fragment parent = node.getparent() if parent is None: logger.debug("Skipping candidate - parent node is 'None'.") @@ -163,7 +165,9 @@ def score_candidates(nodes): # if paragraph is < `MIN_HIT_LENTH` characters don't even count it inner_text = node.text_content().strip() if len(inner_text) < MIN_HIT_LENTH: - logger.debug("Skipping candidate - inner text < %d characters.", MIN_HIT_LENTH) + logger.debug( + "Skipping candidate - inner text < %d characters.", + MIN_HIT_LENTH) continue # initialize readability data for the parent @@ -186,7 +190,8 @@ def score_candidates(nodes): # subtract 0.5 points for each double quote within this paragraph double_quotes_count = inner_text.count('"') content_score += double_quotes_count * -0.5 - logger.debug("Penalty points for %d double-quotes.", double_quotes_count) + logger.debug( + "Penalty points for %d double-quotes.", double_quotes_count) # for every 100 characters in this paragraph, add another point # up to 3 points @@ -195,12 +200,14 @@ def score_candidates(nodes): logger.debug("Bonus points for length of text: %f", length_points) # add the score to the parent - logger.debug("Bonus points for parent %s %r with score %f: %f", + logger.debug( + "Bonus points for parent %s %r with score %f: %f", parent.tag, parent.attrib, candidates[parent].content_score, content_score) candidates[parent].content_score += content_score # the grand node gets half - logger.debug("Bonus points for grand %s %r with score %f: %f", + logger.debug( + "Bonus points for grand %s %r with score %f: %f", grand.tag, grand.attrib, candidates[grand].content_score, content_score / 2.0) candidates[grand].content_score += content_score / 2.0 @@ -212,7 +219,8 @@ def score_candidates(nodes): for candidate in candidates.values(): adjustment = 1.0 - get_link_density(candidate.node) candidate.content_score *= adjustment - logger.debug("Link density adjustment for %s %r: %f", + logger.debug( + "Link density adjustment for %s %r: %f", candidate.node.tag, candidate.node.attrib, adjustment) return candidates diff --git a/readability/scripts/__init__.py b/breadability/scripts/__init__.py similarity index 100% rename from readability/scripts/__init__.py rename to breadability/scripts/__init__.py diff --git a/readability/scripts/client.py b/breadability/scripts/client.py similarity index 94% rename from readability/scripts/client.py rename to breadability/scripts/client.py index 8025b98..7155985 100644 --- a/readability/scripts/client.py +++ b/breadability/scripts/client.py @@ -4,9 +4,9 @@ A fast python port of arc90's readability tool Usage: - readability [options] - readability --version - readability --help + breadability [options] + breadability --version + breadability --help Arguments: URL or file path to process in readable form. @@ -50,7 +50,7 @@ def parse_args(): def main(): args = parse_args() - logger = logging.getLogger("readability") + logger = logging.getLogger("breadability") if args["--verbose"]: logger.setLevel(logging.DEBUG) diff --git a/readability/scripts/test_helper.py b/breadability/scripts/test_helper.py similarity index 93% rename from readability/scripts/test_helper.py rename to breadability/scripts/test_helper.py index a9e40da..5fd2dae 100644 --- a/readability/scripts/test_helper.py +++ b/breadability/scripts/test_helper.py @@ -1,12 +1,12 @@ # -*- coding: utf8 -*- """ -Helper to generate a new set of article test files for readability. +Helper to generate a new set of article test files for breadability. Usage: - readability_test --name - readability_test --version - readability_test --help + breadability_test --name + breadability_test --version + breadability_test --help Arguments: The url of content to fetch for the article.html @@ -39,7 +39,7 @@ from __future__ import absolute_import from __future__ import division, print_function, unicode_literals from os.path import join, dirname -from readability.readable import Article +from breadability.readable import Article from ...compat import unittest diff --git a/readability/utils.py b/breadability/utils.py similarity index 99% rename from readability/utils.py rename to breadability/utils.py index 8fb55ff..7385d9e 100644 --- a/readability/utils.py +++ b/breadability/utils.py @@ -6,6 +6,9 @@ from __future__ import division, print_function, unicode_literals import re +MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE) + + def is_blank(text): """ Returns ``True`` if string contains only whitespace characters @@ -18,7 +21,6 @@ def shrink_text(text): return normalize_whitespace(text.strip()) -MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE) def normalize_whitespace(text): """ Translates multiple whitespace into single space character. diff --git a/setup.py b/setup.py index bdb0dac..70328a1 100644 --- a/setup.py +++ b/setup.py @@ -29,10 +29,10 @@ if sys.version_info < (2, 7): install_requires.append("unittest2") console_script_targets = [ - "readability = readability.scripts.client:main", - "readability-{0} = readability.scripts.client:main", - "readability_test = readability.scripts.test_helper:main", - "readability_test-{0} = readability.scripts.test_helper:main", + "breadability = breadability.scripts.client:main", + "breadability-{0} = breadability.scripts.client:main", + "breadability_test = breadability.scripts.test_helper:main", + "breadability_test-{0} = breadability.scripts.test_helper:main", ] console_script_targets = [ target.format(VERSION_SUFFIX) for target in console_script_targets @@ -40,21 +40,22 @@ console_script_targets = [ setup( - name="readability", + name="breadability", version=VERSION, description="Port of Readability HTML parser in Python", long_description=long_description, keywords=[ + "bookie", + "breadability", + "content", + "HTML", + "parsing", "readability", "readable", - "parsing", - "HTML", - "content", - "bookie", ], author="Rick Harding", author_email="rharding@mitechie.com", - url="https://github.com/bookieio/b readability", + url="https://github.com/bookieio/breadability", license="BSD", classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/tests/run_tests.py b/tests/run_tests.py index 9bc85cd..d6db309 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -12,7 +12,7 @@ from os.path import dirname, abspath DEFAULT_PARAMS = [ "nosetests", "--with-coverage", - "--cover-package=readability", + "--cover-package=breadability", "--cover-erase", ] diff --git a/tests/test_annotated_text.py b/tests/test_annotated_text.py index fa2db37..1bef627 100644 --- a/tests/test_annotated_text.py +++ b/tests/test_annotated_text.py @@ -1,11 +1,15 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals +) from lxml.html import fragment_fromstring, document_fromstring -from readability.readable import Article -from readability.annotated_text import AnnotatedTextHandler +from breadability.readable import Article +from breadability.annotated_text import AnnotatedTextHandler from .compat import unittest from .utils import load_snippet, load_article diff --git a/tests/test_articles/test_antipope_org/test.py b/tests/test_articles/test_antipope_org/test.py index 29844fa..10db633 100644 --- a/tests/test_articles/test_antipope_org/test.py +++ b/tests/test_articles/test_antipope_org/test.py @@ -5,7 +5,7 @@ from __future__ import division, print_function, unicode_literals import os -from readability.readable import Article +from breadability.readable import Article from ...compat import unittest diff --git a/tests/test_articles/test_businessinsider-com/test.py b/tests/test_articles/test_businessinsider-com/test.py index 0d850be..54d5570 100644 --- a/tests/test_articles/test_businessinsider-com/test.py +++ b/tests/test_articles/test_businessinsider-com/test.py @@ -5,7 +5,7 @@ try: except ImportError: import unittest -from readability.readable import Article +from breadability.readable import Article class TestBusinessInsiderArticle(unittest.TestCase): diff --git a/tests/test_articles/test_cz_zdrojak_tests/test.py b/tests/test_articles/test_cz_zdrojak_tests/test.py index 3b8649b..853ce15 100644 --- a/tests/test_articles/test_cz_zdrojak_tests/test.py +++ b/tests/test_articles/test_cz_zdrojak_tests/test.py @@ -4,8 +4,8 @@ from __future__ import absolute_import from __future__ import division, print_function, unicode_literals from os.path import join, dirname -from readability.readable import Article -from readability._compat import unicode +from breadability.readable import Article +from breadability._compat import unicode from ...compat import unittest diff --git a/tests/test_articles/test_scripting_com/test.py b/tests/test_articles/test_scripting_com/test.py index d0bd917..6b74d52 100644 --- a/tests/test_articles/test_scripting_com/test.py +++ b/tests/test_articles/test_scripting_com/test.py @@ -1,14 +1,18 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals +) import os from operator import attrgetter -from readability.readable import Article -from readability.readable import check_siblings -from readability.readable import prep_article +from breadability.readable import Article +from breadability.readable import check_siblings +from breadability.readable import prep_article from ...compat import unittest @@ -57,7 +61,8 @@ class TestArticle(unittest.TestCase): for node in doc._should_drop: self.assertFalse(node == found.node) - by_score = sorted([c for c in doc.candidates.values()], + by_score = sorted( + [c for c in doc.candidates.values()], key=attrgetter('content_score'), reverse=True) self.assertTrue(by_score[0].node == found.node) diff --git a/tests/test_articles/test_sweetshark/test.py b/tests/test_articles/test_sweetshark/test.py index 7580e05..e4e498c 100644 --- a/tests/test_articles/test_sweetshark/test.py +++ b/tests/test_articles/test_sweetshark/test.py @@ -5,7 +5,7 @@ try: except ImportError: import unittest -from readability.readable import Article +from breadability.readable import Article class TestSweetsharkBlog(unittest.TestCase): diff --git a/tests/test_orig_document.py b/tests/test_orig_document.py index 5a7181d..8d2bcc7 100644 --- a/tests/test_orig_document.py +++ b/tests/test_orig_document.py @@ -4,9 +4,12 @@ from __future__ import absolute_import from __future__ import division, print_function, unicode_literals from collections import defaultdict -from readability._compat import to_unicode, to_bytes -from readability.document import (OriginalDocument, determine_encoding, - convert_breaks_to_paragraphs) +from breadability._compat import to_unicode, to_bytes +from breadability.document import ( + convert_breaks_to_paragraphs, + determine_encoding, + OriginalDocument, +) from .compat import unittest from .utils import load_snippet @@ -18,14 +21,16 @@ class TestOriginalDocument(unittest.TestCase): returned = convert_breaks_to_paragraphs( "

HI

How are you?

\t \n
Fine\n I guess
") - self.assertEqual(returned, + self.assertEqual( + returned, "
HI

How are you?

Fine\n I guess

") def test_convert_hr_tags_to_paragraphs(self): returned = convert_breaks_to_paragraphs( "
HI

How are you?
\t \n
Fine\n I guess
") - self.assertEqual(returned, + self.assertEqual( + returned, "
HI

How are you?

Fine\n I guess

") def test_readin_min_document(self): @@ -79,7 +84,7 @@ class TestOriginalDocument(unittest.TestCase): def test_encoding(self): text = "ľščťžýáíéäúňôůě".encode("iso-8859-2") - encoding = determine_encoding(text) + determine_encoding(text) def test_encoding_short(self): text = "ľščťžýáíé".encode("iso-8859-2") diff --git a/tests/test_readable.py b/tests/test_readable.py index 7b0a574..727c793 100644 --- a/tests/test_readable.py +++ b/tests/test_readable.py @@ -6,14 +6,16 @@ from __future__ import division, print_function, unicode_literals from lxml.etree import tounicode from lxml.html import document_fromstring from lxml.html import fragment_fromstring -from readability._compat import to_unicode -from readability.readable import Article -from readability.readable import get_class_weight -from readability.readable import get_link_density -from readability.readable import is_bad_link -from readability.readable import score_candidates -from readability.readable import leaf_div_elements_into_paragraphs -from readability.scoring import ScoredNode +from breadability._compat import to_unicode +from breadability.readable import ( + Article, + get_class_weight, + get_link_density, + is_bad_link, + leaf_div_elements_into_paragraphs, + score_candidates, +) +from breadability.scoring import ScoredNode from .compat import unittest from .utils import load_snippet, load_article @@ -65,7 +67,6 @@ class TestReadableDocument(unittest.TestCase): self.assertEqual(doc.readable_dom.tag, 'div') self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - def test_no_content(self): """Without content we supply an empty unparsed doc.""" doc = Article('') @@ -81,10 +82,11 @@ class TestCleaning(unittest.TestCase): """Verify we wipe out things from our unlikely list.""" doc = Article(load_snippet('test_readable_unlikely.html')) readable = doc.readable_dom - must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot', - 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', - 'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager', - 'popup', 'tweet', 'twitter', 'imgBlogpostPermalink'] + must_not_appear = [ + 'comment', 'community', 'disqus', 'extra', 'foot', + 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', + 'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager', + 'popup', 'tweet', 'twitter', 'imgBlogpostPermalink'] want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow'] @@ -127,9 +129,8 @@ class TestCleaning(unittest.TestCase): '') test_doc2 = document_fromstring(test_html2) self.assertEqual( - tounicode( - leaf_div_elements_into_paragraphs(test_doc2)), - to_unicode('

simplelink

') + tounicode(leaf_div_elements_into_paragraphs(test_doc2)), + to_unicode('

simplelink

') ) def test_dont_transform_div_with_div(self): diff --git a/tests/test_scoring.py b/tests/test_scoring.py index 823987d..80d3462 100644 --- a/tests/test_scoring.py +++ b/tests/test_scoring.py @@ -8,14 +8,18 @@ import re from operator import attrgetter from lxml.html import document_fromstring from lxml.html import fragment_fromstring -from readability.readable import Article -from readability.scoring import check_node_attributes -from readability.scoring import get_class_weight -from readability.scoring import ScoredNode -from readability.scoring import score_candidates -from readability.scoring import generate_hash_id -from readability.readable import get_link_density -from readability.readable import is_unlikely_node +from breadability.readable import Article +from breadability.scoring import ( + check_node_attributes, + generate_hash_id, + get_class_weight, + score_candidates, + ScoredNode, +) +from breadability.readable import ( + get_link_density, + is_unlikely_node, +) from .compat import unittest from .utils import load_snippet @@ -60,7 +64,8 @@ class TestCheckNodeAttr(unittest.TestCase): test_node = fragment_fromstring('
') test_node.set('class', 'test2 comment') - self.assertTrue(check_node_attributes(test_pattern, test_node, 'class')) + self.assertTrue( + check_node_attributes(test_pattern, test_node, 'class')) def test_has_id(self): """Verify that a node has an id in our set.""" @@ -75,7 +80,8 @@ class TestCheckNodeAttr(unittest.TestCase): test_pattern = re.compile('test1|test2', re.I) test_node = fragment_fromstring('
') test_node.set('class', 'test4 comment') - self.assertFalse(check_node_attributes(test_pattern, test_node, 'class')) + self.assertFalse( + check_node_attributes(test_pattern, test_node, 'class')) def test_lacks_id(self): """Verify that a node does not have an id in our set.""" @@ -266,7 +272,8 @@ class TestScoreCandidates(unittest.TestCase): div_nodes = dom.findall(".//div") candidates = score_candidates(div_nodes) - ordered = sorted((c for c in candidates.values()), reverse=True, + ordered = sorted( + (c for c in candidates.values()), reverse=True, key=attrgetter("content_score")) self.assertEqual(ordered[0].node.tag, "div")