This is a great amount of info
-And more content Home -
diff --git a/breadability/scripts/test_helper.py b/breadability/scripts/test_helper.py index b8266bf..7d79d37 100644 --- a/breadability/scripts/test_helper.py +++ b/breadability/scripts/test_helper.py @@ -35,41 +35,41 @@ TEST_PATH = join( TEST_TEMPLATE = '''# -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +""" +Test the scoring and parsing of the article from URL below: +%(source_url)s +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os + +import pytest -from os.path import join, dirname from breadability.readable import Article -from ...compat import unittest -class TestArticle(unittest.TestCase): - """ - Test the scoring and parsing of the article from URL below: - %(source_url)s - """ +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), "article.html") + with open(article_path, "rb") as file: + return Article(file.read(), "%(source_url)s") + - def setUp(self): - """Load up the article for us""" - article_path = join(dirname(__file__), "article.html") - with open(article_path, "rb") as file: - self.document = Article(file.read(), "%(source_url)s") +def test_parses(article): + """Verify we can parse the document.""" + assert 'id="readabilityBody"' in article.readable - def tearDown(self): - """Drop the article""" - self.document = None - def test_parses(self): - """Verify we can parse the document.""" - self.assertIn('id="readabilityBody"', self.document.readable) +def test_content_exists(article): + """Verify that some content exists.""" + assert "#&@#&@#&@" in article.readable - def test_content_exists(self): - """Verify that some content exists.""" - self.assertIn("#&@#&@#&@", self.document.readable) - def test_content_does_not_exist(self): - """Verify we cleaned out some content that shouldn't exist.""" - self.assertNotIn("", self.document.readable) +def test_content_does_not_exist(article): + """Verify we cleaned out some content that shouldn't exist.""" + assert "" not in article.readable ''' diff --git a/setup.py b/setup.py index 7ec4f89..0239192 100644 --- a/setup.py +++ b/setup.py @@ -32,9 +32,6 @@ tests_require = [ ] -if sys.version_info < (2, 7): - install_requires.append("unittest2") - console_script_targets = [ "breadability = breadability.scripts.client:main", "breadability-{0} = breadability.scripts.client:main", diff --git a/tests/compat.py b/tests/compat.py index 0c6f910..6b2f5e0 100644 --- a/tests/compat.py +++ b/tests/compat.py @@ -2,8 +2,3 @@ from __future__ import absolute_import from __future__ import division, print_function, unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest diff --git a/tests/test_annotated_text.py b/tests/test_annotated_text.py index 1bef627..a417c94 100644 --- a/tests/test_annotated_text.py +++ b/tests/test_annotated_text.py @@ -10,164 +10,161 @@ from __future__ import ( from lxml.html import fragment_fromstring, document_fromstring from breadability.readable import Article from breadability.annotated_text import AnnotatedTextHandler -from .compat import unittest from .utils import load_snippet, load_article -class TestAnnotatedText(unittest.TestCase): - def test_simple_document(self): - dom = fragment_fromstring("
This is\n\tsimple\ttext.
") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("This is\nsimple text.", None), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_empty_paragraph(self): - dom = fragment_fromstring("Paragraph
\t \n
1 first
2\tsecond
3\rthird
text emphasis
last
text emphasis
last
This is\n\tsimple\ttext.
") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("This is\nsimple text.", None), + ), + ] + + +def test_empty_paragraph(): + dom = fragment_fromstring("Paragraph
\t \n
1 first
2\tsecond
3\rthird
text emphasis
last
text emphasis
last
How are you?
Fine\n I guess
How are you?
Fine\n I guess
tags"""
- doc = OriginalDocument(load_snippet('document_min.html'))
- self.assertIsNone(doc.dom.find('.//br'))
-
- def test_empty_title(self):
- """We convert all
tags to
tags""" - document = OriginalDocument( - "
tags""" - document = OriginalDocument( - "
tags""" - document = OriginalDocument("
") - self.assertEqual(document.title, "") - - def test_encoding(self): - text = "ľščťžýáíéäúňôůě".encode("iso-8859-2") - html = decode_html(text) - self.assertEqual(type(html), unicode) - - def test_encoding_short(self): - text = to_bytes("ľščťžýáíé") - html = decode_html(text) - self.assertEqual(type(html), unicode) - self.assertEqual(html, "ľščťžýáíé") +def test_convert_br_tags_to_paragraphs(): + returned = convert_breaks_to_paragraphs( + ("How are you?
Fine\n I guess
How are you?
Fine\n I guess
tags"""
+ doc = OriginalDocument(load_snippet('document_min.html'))
+
+ assert doc.dom.find('.//br') is None
+
+
+def test_empty_title():
+ """We convert all
tags to
tags""" + document = OriginalDocument( + "
tags""" + document = OriginalDocument( + "
tags""" + document = OriginalDocument("
") + + assert document.title == "" + + +def test_encoding(): + text = "ľščťžýáíéäúňôůě".encode("iso-8859-2") + html = decode_html(text) + + assert type(html) is unicode + + +def test_encoding_short(): + text = to_bytes("ľščťžýáíé") + html = decode_html(text) + + assert type(html) is unicode + assert html == "ľščťžýáíé" diff --git a/tests/test_readable.py b/tests/test_readable.py index 483bb6f..bba0ffb 100644 --- a/tests/test_readable.py +++ b/tests/test_readable.py @@ -1,347 +1,352 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals +import pytest from lxml.etree import tounicode -from lxml.html import document_fromstring -from lxml.html import fragment_fromstring +from lxml.html import document_fromstring, fragment_fromstring + from breadability._compat import to_unicode -from breadability.readable import ( - Article, - get_class_weight, - get_link_density, - is_bad_link, - leaf_div_elements_into_paragraphs, - score_candidates, -) +from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link, + leaf_div_elements_into_paragraphs, score_candidates, ) from breadability.scoring import ScoredNode -from .compat import unittest -from .utils import load_snippet, load_article +from .utils import load_article, load_snippet +# TestReadableDocument +"""Verify we can process html into a document to work off of.""" -class TestReadableDocument(unittest.TestCase): - """Verify we can process html into a document to work off of.""" - def test_load_doc(self): - """We get back an element tree from our original doc""" - doc = Article(load_snippet('document_min.html')) - # We get back the document as a div tag currently by default. - self.assertEqual(doc.readable_dom.tag, 'div') +def test_load_doc(): + """We get back an element tree from our original doc""" + doc = Article(load_snippet('document_min.html')) + # We get back the document as a div tag currently by default. - def test_title_loads(self): - """Verify we can fetch the title of the parsed article""" - doc = Article(load_snippet('document_min.html')) - self.assertEqual( - doc._original_document.title, - 'Min Document Title' - ) + assert doc.readable_dom.tag == 'div' - def test_doc_no_scripts_styles(self): - """Step #1 remove all scripts from the document""" - doc = Article(load_snippet('document_scripts.html')) - readable = doc.readable_dom - self.assertEqual(readable.findall(".//script"), []) - self.assertEqual(readable.findall(".//style"), []) - self.assertEqual(readable.findall(".//link"), []) - - def test_find_body_exists(self): - """If the document has a body, we store that as the readable html - - No sense processing anything other than the body content. - - """ - doc = Article(load_snippet('document_min.html')) - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - - def test_body_doesnt_exist(self): - """If we can't find a body, then we create one. - - We build our doc around the rest of the html we parsed. - - """ - doc = Article(load_snippet('document_no_body.html')) - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - - def test_bare_content(self): - """If the document is just pure content, no html tags we should be ok - - We build our doc around the rest of the html we parsed. - - """ - doc = Article(load_snippet('document_only_content.html')) - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - - def test_no_content(self): - """Without content we supply an empty unparsed doc.""" - doc = Article('') - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - self.assertEqual(doc.readable_dom.get('class'), 'parsing-error') - - -class TestCleaning(unittest.TestCase): - """Test out our cleaning processing we do.""" - - def test_unlikely_hits(self): - """Verify we wipe out things from our unlikely list.""" - doc = Article(load_snippet('test_readable_unlikely.html')) - readable = doc.readable_dom - must_not_appear = [ - 'comment', 'community', 'disqus', 'extra', 'foot', - 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', - 'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager', - 'popup', 'tweet', 'twitter', 'imgBlogpostPermalink'] - - want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow'] - - for i in must_not_appear: - # we cannot find any class or id with this value - by_class = readable.find_class(i) - - for test in by_class: - # if it's here it cannot have the must not class without the - # want to appear class - found = False - for cls in test.get('class').split(): - if cls in want_to_appear: - found = True - self.assertTrue(found) - - by_ids = readable.get_element_by_id(i, False) - if by_ids is not False: - found = False - for ids in test.get('id').split(): - if ids in want_to_appear: - found = True - self.assertTrue(found) - - def test_misused_divs_transform(self): - """Verify we replace leaf node divs with p's - - They should have the same content, just be a p vs a div - - """ - test_html = "simple
") - ) - test_html2 = ('simplelink
') - ) +def test_title_loads(): + """Verify we can fetch the title of the parsed article""" + doc = Article(load_snippet('document_min.html')) - def test_dont_transform_div_with_div(self): - """Verify that only child.""" - dom = document_fromstring( - "
child
" - "aftertextsimple
" + ) + + test_html2 = ('simplelink
' + ) + + +def test_dont_transform_div_with_div(): + """Verify that only child.""" + dom = document_fromstring( + "
child
" + "aftertext') - self.assertEqual(get_class_weight(node), 25) +def test_article_enables_candidate_access(): + """Candidates are accessible after document processing.""" + doc = Article(load_article('ars.001.html')) - def test_positive_ids(self): - """Some ids get us bonus points.""" - node = fragment_fromstring('
') - self.assertEqual(get_class_weight(node), 25) + assert hasattr(doc, 'candidates') - def test_negative_class(self): - """Some classes get us negative points.""" - node = fragment_fromstring('
') - self.assertEqual(get_class_weight(node), -25) - def test_negative_ids(self): - """Some ids get us negative points.""" - node = fragment_fromstring('
') - self.assertEqual(get_class_weight(node), -25) +# Certain ids and classes get us bonus points. -class TestScoringNodes(unittest.TestCase): - """We take out list of potential nodes and score them up.""" +def test_positive_class(): + """Some classes get us bonus points.""" + node = fragment_fromstring('
') + assert get_class_weight(node) == 25 - def test_we_get_candidates(self): - """Processing candidates should get us a list of nodes to try out.""" - doc = document_fromstring(load_article("ars.001.html")) - test_nodes = tuple(doc.iter("p", "td", "pre")) - candidates = score_candidates(test_nodes) - # this might change as we tweak our algorithm, but if it does, - # it signifies we need to look at what we changed. - self.assertEqual(len(candidates.keys()), 37) +def test_positive_ids(): + """Some ids get us bonus points.""" + node = fragment_fromstring('
') + assert get_class_weight(node) == 25 - # one of these should have a decent score - scores = sorted(c.content_score for c in candidates.values()) - self.assertTrue(scores[-1] > 100) - - def test_bonus_score_per_100_chars_in_p(self): - """Nodes get 1 point per 100 characters up to max. 3 points.""" - def build_candidates(length): - html = "
%s
" % ("c" * length) - node = fragment_fromstring(html) - - return [node] - - test_nodes = build_candidates(50) - candidates = score_candidates(test_nodes) - pscore_50 = max(c.content_score for c in candidates.values()) - - test_nodes = build_candidates(100) - candidates = score_candidates(test_nodes) - pscore_100 = max(c.content_score for c in candidates.values()) - - test_nodes = build_candidates(300) - candidates = score_candidates(test_nodes) - pscore_300 = max(c.content_score for c in candidates.values()) - - test_nodes = build_candidates(400) - candidates = score_candidates(test_nodes) - pscore_400 = max(c.content_score for c in candidates.values()) - - self.assertAlmostEqual(pscore_50 + 0.5, pscore_100) - self.assertAlmostEqual(pscore_100 + 2.0, pscore_300) - self.assertAlmostEqual(pscore_300, pscore_400) - - -class TestLinkDensityScoring(unittest.TestCase): - """Link density will adjust out candidate scoresself.""" - - def test_link_density(self): - """Test that we get a link density""" - doc = document_fromstring(load_article('ars.001.html')) - for node in doc.iter('p', 'td', 'pre'): - density = get_link_density(node) - - # the density must be between 0, 1 - self.assertTrue(density >= 0.0 and density <= 1.0) - - -class TestSiblings(unittest.TestCase): - """Siblings will be included if their content is related.""" - - @unittest.skip("Not implemented yet.") - def test_bad_siblings_not_counted(self): - raise NotImplementedError() - - @unittest.skip("Not implemented yet.") - def test_good_siblings_counted(self): - raise NotImplementedError() - - -class TestMainText(unittest.TestCase): - def test_empty(self): - article = Article("") - annotated_text = article.main_text - - self.assertEqual(annotated_text, []) - - def test_no_annotations(self): - article = Article("This is text with no annotations
This is text\r\twith no annotations
') + assert get_class_weight(node) == -25 + + +def test_negative_ids(): + """Some ids get us negative points.""" + node = fragment_fromstring('
') + assert get_class_weight(node) == -25 + + +# We take out list of potential nodes and score them up. + + +def test_we_get_candidates(): + """Processing candidates should get us a list of nodes to try out.""" + doc = document_fromstring(load_article("ars.001.html")) + test_nodes = tuple(doc.iter("p", "td", "pre")) + candidates = score_candidates(test_nodes) + + # this might change as we tweak our algorithm, but if it does, + # it signifies we need to look at what we changed. + assert len(candidates.keys()) == 37 + + # one of these should have a decent score + scores = sorted(c.content_score for c in candidates.values()) + assert scores[-1] > 100 + + +def test_bonus_score_per_100_chars_in_p(): + """Nodes get 1 point per 100 characters up to max. 3 points.""" + def build_candidates(length): + html = "
%s
" % ("c" * length) + node = fragment_fromstring(html) + + return [node] + + test_nodes = build_candidates(50) + candidates = score_candidates(test_nodes) + pscore_50 = max(c.content_score for c in candidates.values()) + + test_nodes = build_candidates(100) + candidates = score_candidates(test_nodes) + pscore_100 = max(c.content_score for c in candidates.values()) + + test_nodes = build_candidates(300) + candidates = score_candidates(test_nodes) + pscore_300 = max(c.content_score for c in candidates.values()) + + test_nodes = build_candidates(400) + candidates = score_candidates(test_nodes) + pscore_400 = max(c.content_score for c in candidates.values()) + + assert pscore_50 + 0.5 == pscore_100 + assert pscore_100 + 2.0 == pscore_300 + assert pscore_300 == pscore_400 + + +# Link density will adjust out candidate scoresself. + + +def test_link_density(): + """Test that we get a link density""" + doc = document_fromstring(load_article('ars.001.html')) + for node in doc.iter('p', 'td', 'pre'): + density = get_link_density(node) + + # the density must be between 0, 1 + assert density >= 0.0 and density <= 1.0 + + +# Siblings will be included if their content is related. + + +@pytest.mark.skip("Not implemented yet.") +def test_bad_siblings_not_counted(): + raise NotImplementedError() + + +@pytest.mark.skip("Not implemented yet.") +def test_good_siblings_counted(): + raise NotImplementedError() + + +# TestMainText + +def test_empty(): + article = Article("") + annotated_text = article.main_text + + assert annotated_text == [] + + +def test_no_annotations(): + article = Article("This is text with no annotations
This is text\r\twith no annotations
This is a great amount of info
-And more content Home -
This is a great amount of info
+And more content Home +