Tests migrated into pytest style

pull/35/head
Mišo Belica 6 years ago
parent 48acf389b1
commit aa83825334

@ -35,41 +35,41 @@ TEST_PATH = join(
TEST_TEMPLATE = '''# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
%(source_url)s
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestArticle(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
%(source_url)s
"""
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), "article.html")
with open(article_path, "rb") as file:
return Article(file.read(), "%(source_url)s")
def setUp(self):
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "%(source_url)s")
def test_parses(article):
"""Verify we can parse the document."""
assert 'id="readabilityBody"' in article.readable
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
def test_content_exists(article):
"""Verify that some content exists."""
assert "#&@#&@#&@" in article.readable
def test_content_exists(self):
"""Verify that some content exists."""
self.assertIn("#&@#&@#&@", self.document.readable)
def test_content_does_not_exist(self):
"""Verify we cleaned out some content that shouldn't exist."""
self.assertNotIn("", self.document.readable)
def test_content_does_not_exist(article):
"""Verify we cleaned out some content that shouldn't exist."""
assert "" not in article.readable
'''

@ -32,9 +32,6 @@ tests_require = [
]
if sys.version_info < (2, 7):
install_requires.append("unittest2")
console_script_targets = [
"breadability = breadability.scripts.client:main",
"breadability-{0} = breadability.scripts.client:main",

@ -2,8 +2,3 @@
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
try:
import unittest2 as unittest
except ImportError:
import unittest

@ -10,164 +10,161 @@ from __future__ import (
from lxml.html import fragment_fromstring, document_fromstring
from breadability.readable import Article
from breadability.annotated_text import AnnotatedTextHandler
from .compat import unittest
from .utils import load_snippet, load_article
class TestAnnotatedText(unittest.TestCase):
    """Exercise AnnotatedTextHandler: a DOM node is parsed into a list of
    paragraphs, each paragraph being a tuple of (text, annotations) chunks
    where annotations is either None or a tuple of tag names.
    """

    def test_simple_document(self):
        node = fragment_fromstring("<p>This is\n\tsimple\ttext.</p>")
        result = AnnotatedTextHandler.parse(node)
        # Tabs collapse to spaces; the newline inside the chunk survives.
        self.assertEqual(result, [(("This is\nsimple text.", None),)])

    def test_empty_paragraph(self):
        node = fragment_fromstring("<div><p>Paragraph <p>\t \n</div>")
        result = AnnotatedTextHandler.parse(node)
        # The whitespace-only paragraph is dropped entirely.
        self.assertEqual(result, [(("Paragraph", None),)])

    def test_multiple_paragraphs(self):
        node = fragment_fromstring("<div><p> 1 first<p> 2\tsecond <p>3\rthird </div>")
        result = AnnotatedTextHandler.parse(node)
        self.assertEqual(result, [
            (("1 first", None),),
            (("2 second", None),),
            (("3\nthird", None),),
        ])

    def test_single_annotation(self):
        node = fragment_fromstring("<div><p> text <em>emphasis</em> <p> last</div>")
        result = AnnotatedTextHandler.parse(node)
        self.assertEqual(result, [
            (("text", None), ("emphasis", ("em",))),
            (("last", None),),
        ])

    def test_recursive_annotation(self):
        node = fragment_fromstring(
            "<div><p> text <em><i><em>emphasis</em></i></em> <p> last</div>")
        result = AnnotatedTextHandler.parse(node)
        # Nested tags are reported once each, as a sorted unique tuple.
        self.assertEqual(result, [
            (("text", None), ("emphasis", ("em", "i"))),
            (("last", None),),
        ])

    def test_annotations_without_explicit_paragraph(self):
        node = fragment_fromstring(
            "<div>text <strong>emphasis</strong>\t<b>hmm</b> </div>")
        result = AnnotatedTextHandler.parse(node)
        self.assertEqual(result, [
            (("text", None), ("emphasis", ("strong",)), ("hmm", ("b",))),
        ])

    def test_process_paragraph_with_chunked_text(self):
        handler = AnnotatedTextHandler()
        paragraph = handler._process_paragraph([
            (" 1", ("b", "del")),
            (" 2", ("b", "del")),
            (" 3", None),
            (" 4", None),
            (" 5", None),
            (" 6", ("em",)),
        ])
        # Adjacent chunks carrying identical annotations are merged.
        self.assertEqual(paragraph, (
            ("1 2", ("b", "del")),
            ("3 4 5", None),
            ("6", ("em",)),
        ))

    def test_include_heading(self):
        node = document_fromstring(load_snippet("h1_and_2_paragraphs.html"))
        result = AnnotatedTextHandler.parse(node.find("body"))
        expected = [
            (
                ('Nadpis H1, ktorý chce byť prvý s textom ale predbehol ho "title"', ("h1",)),
                ("Toto je prvý odstavec a to je fajn.", None),
            ),
            (
                ("Tento text je tu aby vyplnil prázdne miesto v srdci súboru.\nAj súbory majú predsa city.", None),
            ),
        ]
        self.assertSequenceEqual(result, expected)

    def test_real_article(self):
        document = Article(load_article("zdrojak_automaticke_zabezpeceni.html"))
        expected = [
            (
                ("Automatické zabezpečení", ("h1",)),
                ("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None),
            ),
            (
                ("Aplikace zabezpečená není, neošetřuje uživatelské vstupy ani své výstupy.", ("li", "ol")),
                ("Aplikace se o zabezpečení snaží, ale takovým způsobem, že na ně lze zapomenout.", ("li", "ol")),
                ("Aplikace se o zabezpečení stará sama, prakticky se nedá udělat chyba.", ("li", "ol")),
            ),
            (
                ("Jak se tyto úrovně projevují v jednotlivých oblastech?", None),
            ),
            (
                ("XSS", ("a", "h2")),
                ("Druhou úroveň představuje ruční ošetřování pomocí", None),
                ("htmlspecialchars", ("a", "kbd")),
                (". Třetí úroveň zdánlivě reprezentuje automatické ošetřování v šablonách, např. v", None),
                ("Nette Latte", ("a", "strong")),
                (". Proč píšu zdánlivě? Problém je v tom, že ošetření se dá obvykle snadno zakázat, např. v Latte pomocí", None),
                ("{!$var}", ("code",)),
                (". Viděl jsem šablony plné vykřičníků i na místech, kde být neměly. Autor to vysvětlil tak, že psaní", None),
                ("{$var}", ("code",)),
                ("někde způsobovalo problémy, které po přidání vykřičníku zmizely, tak je začal psát všude.", None),
            ),
            (
                ("<?php\n$safeHtml = $texy->process($content_texy);\n$content = Html::el()->setHtml($safeHtml);\n// v šabloně pak můžeme použít {$content}\n?>", ("pre", )),
            ),
            (
                ("Ideální by bylo, když by už samotná metoda", None),
                ("process()", ("code",)),
                ("vracela instanci", None),
                ("Html", ("code",)),
                (".", None),
            ),
        ]
        self.assertSequenceEqual(document.main_text, expected)
def test_simple_document():
    """Whitespace in a single paragraph collapses; inner newline survives."""
    dom = fragment_fromstring("<p>This is\n\tsimple\ttext.</p>")
    expected = [
        (("This is\nsimple text.", None),),
    ]
    assert AnnotatedTextHandler.parse(dom) == expected
def test_empty_paragraph():
    """A whitespace-only paragraph is dropped from the result."""
    dom = fragment_fromstring("<div><p>Paragraph <p>\t \n</div>")
    expected = [
        (("Paragraph", None),),
    ]
    assert AnnotatedTextHandler.parse(dom) == expected
def test_multiple_paragraphs():
    """Each <p> becomes its own paragraph tuple, trimmed and normalized."""
    dom = fragment_fromstring("<div><p> 1 first<p> 2\tsecond <p>3\rthird </div>")
    expected = [
        (("1 first", None),),
        (("2 second", None),),
        (("3\nthird", None),),
    ]
    assert AnnotatedTextHandler.parse(dom) == expected
def test_single_annotation():
    """An <em> chunk is annotated with its tag name."""
    dom = fragment_fromstring("<div><p> text <em>emphasis</em> <p> last</div>")
    expected = [
        (("text", None), ("emphasis", ("em",))),
        (("last", None),),
    ]
    assert AnnotatedTextHandler.parse(dom) == expected
def test_recursive_annotation():
    """Nested annotation tags are flattened into one unique tuple."""
    dom = fragment_fromstring(
        "<div><p> text <em><i><em>emphasis</em></i></em> <p> last</div>")
    expected = [
        (("text", None), ("emphasis", ("em", "i"))),
        (("last", None),),
    ]
    assert AnnotatedTextHandler.parse(dom) == expected
def test_annotations_without_explicit_paragraph():
    """Bare annotated text (no <p>) still forms a single paragraph."""
    dom = fragment_fromstring(
        "<div>text <strong>emphasis</strong>\t<b>hmm</b> </div>")
    expected = [
        (("text", None), ("emphasis", ("strong",)), ("hmm", ("b",))),
    ]
    assert AnnotatedTextHandler.parse(dom) == expected
def test_process_paragraph_with_chunked_text():
    """Adjacent chunks sharing identical annotations are merged."""
    handler = AnnotatedTextHandler()
    chunks = [
        (" 1", ("b", "del")),
        (" 2", ("b", "del")),
        (" 3", None),
        (" 4", None),
        (" 5", None),
        (" 6", ("em",)),
    ]
    expected = (
        ("1 2", ("b", "del")),
        ("3 4 5", None),
        ("6", ("em",)),
    )
    assert handler._process_paragraph(chunks) == expected
def test_include_heading():
    """The <h1> heading is merged into the first paragraph's chunks."""
    dom = document_fromstring(load_snippet("h1_and_2_paragraphs.html"))
    result = AnnotatedTextHandler.parse(dom.find("body"))
    expected = [
        (
            ('Nadpis H1, ktorý chce byť prvý s textom ale predbehol ho "title"', ("h1",)),
            ("Toto je prvý odstavec a to je fajn.", None),
        ),
        (
            ("Tento text je tu aby vyplnil prázdne miesto v srdci súboru.\nAj súbory majú predsa city.", None),
        ),
    ]
    assert result == expected
def test_real_article():
    """End-to-end check: Article.main_text of a stored real-world page
    yields the expected annotated paragraphs (headings, list items,
    inline code and links all carry their tag annotations)."""
    article = Article(load_article("zdrojak_automaticke_zabezpeceni.html"))
    annotated_text = article.main_text
    # Expected structure: list of paragraphs; each paragraph is a tuple of
    # (text, annotations) chunks, annotations being None or a tag tuple.
    assert annotated_text == [
        (
            ("Automatické zabezpečení", ("h1",)),
            ("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None),
        ),
        (
            ("Aplikace zabezpečená není, neošetřuje uživatelské vstupy ani své výstupy.", ("li", "ol")),
            ("Aplikace se o zabezpečení snaží, ale takovým způsobem, že na ně lze zapomenout.", ("li", "ol")),
            ("Aplikace se o zabezpečení stará sama, prakticky se nedá udělat chyba.", ("li", "ol")),
        ),
        (
            ("Jak se tyto úrovně projevují v jednotlivých oblastech?", None),
        ),
        (
            ("XSS", ("a", "h2")),
            ("Druhou úroveň představuje ruční ošetřování pomocí", None),
            ("htmlspecialchars", ("a", "kbd")),
            (". Třetí úroveň zdánlivě reprezentuje automatické ošetřování v šablonách, např. v", None),
            ("Nette Latte", ("a", "strong")),
            (". Proč píšu zdánlivě? Problém je v tom, že ošetření se dá obvykle snadno zakázat, např. v Latte pomocí", None),
            ("{!$var}", ("code",)),
            (". Viděl jsem šablony plné vykřičníků i na místech, kde být neměly. Autor to vysvětlil tak, že psaní", None),
            ("{$var}", ("code",)),
            ("někde způsobovalo problémy, které po přidání vykřičníku zmizely, tak je začal psát všude.", None),
        ),
        (
            ("<?php\n$safeHtml = $texy->process($content_texy);\n$content = Html::el()->setHtml($safeHtml);\n// v šabloně pak můžeme použít {$content}\n?>", ("pre", )),
        ),
        (
            ("Ideální by bylo, když by už samotná metoda", None),
            ("process()", ("code",)),
            ("vracela instanci", None),
            ("Html", ("code",)),
            (".", None),
        ),
    ]

@ -1,42 +1,45 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""Test the scoring and parsing of the Blog Post"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from breadability.readable import Article
from ...compat import unittest
class TestAntipopeBlog(unittest.TestCase):
"""Test the scoring and parsing of the Blog Post"""
@pytest.fixture(scope="module")
def article():
    """Read the stored article HTML once per test module and return it."""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    # Use `f`, not `file`, so the builtin name is not shadowed.
    with open(article_path) as f:
        return f.read()
def test_parses(article):
    """Verify we can parse the document."""
    document = Article(article)
    body_marker = 'id="readabilityBody"'
    assert body_marker in document.readable
def setUp(self):
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
def test_comments_cleaned(article):
"""The div with the comments should be removed."""
doc = Article(article)
def tearDown(self):
"""Drop the article"""
self.article = None
assert 'class="comments"' not in doc.readable
def test_parses(self):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
def test_comments_cleaned(self):
"""The div with the comments should be removed."""
doc = Article(self.article)
self.assertTrue('class="comments"' not in doc.readable)
def test_beta_removed(article):
"""The id=beta element should be removed
def test_beta_removed(self):
"""The id=beta element should be removed
It's link heavy and causing a lot of garbage content. This should be
removed.
It's link heavy and causing a lot of garbage content. This should be
removed.
"""
doc = Article(article)
"""
doc = Article(self.article)
self.assertTrue('id="beta"' not in doc.readable)
assert 'id="beta"' not in doc.readable

@ -1,33 +1,34 @@
# -*- coding: utf8 -*-
"""Test the scoring and parsing of the Blog Post"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
try:
# Python < 2.7
import unittest2 as unittest
except ImportError:
import unittest
import pytest
from breadability.readable import Article
class TestBusinessInsiderArticle(unittest.TestCase):
"""Test the scoring and parsing of the Blog Post"""
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
with open(article_path) as file:
return file.read()
def setUp(self):
def test_parses(article):
"""Verify we can parse the document."""
doc = Article(article)
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
assert 'id="readabilityBody"' in doc.readable
def tearDown(self):
"""Drop the article"""
self.article = None
def test_parses(self):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
def test_images_preserved(article):
"""The div with the comments should be removed."""
doc = Article(article)
def test_images_preserved(self):
"""The div with the comments should be removed."""
doc = Article(self.article)
self.assertTrue('bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable)
self.assertTrue('bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable)
assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable
assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable

@ -1,39 +1,33 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestArticle(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
"""
def setUp(self):
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
def test_images_preserved(self):
"""The div with the comments should be removed."""
images = [
'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg',
'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg',
]
for image in images:
self.assertIn(image, self.document.readable, image)
@pytest.fixture(scope="module")
def article():
    """Parse the stored article once per test module and return the Article."""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    # Use `f`, not `file`, so the builtin name is not shadowed.
    with open(article_path, "rb") as f:
        return Article(f.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")
def test_parses(article):
    """Verify we can parse the document."""
    body_marker = 'id="readabilityBody"'
    assert body_marker in article.readable
def test_images_preserved(article):
    """Inline article images must survive readability cleanup."""
    assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in article.readable
    assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in article.readable

@ -1,44 +1,44 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
http://www.zdrojak.cz/clanky/jeste-k-testovani/
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from os.path import join, dirname
from breadability.readable import Article
from breadability._compat import unicode
from ...compat import unittest
from breadability.readable import Article
@pytest.fixture(scope="module")
def article():
    """Parse the stored article once per test module and return the Article."""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    # Use `f`, not `file`, so the builtin name is not shadowed.
    with open(article_path, "rb") as f:
        return Article(f.read(), "http://www.zdrojak.cz/clanky/jeste-k-testovani/")
class TestArticle(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
http://www.zdrojak.cz/clanky/jeste-k-testovani/
"""
def setUp(self):
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://www.zdrojak.cz/clanky/jeste-k-testovani/")
def test_parses(article):
    """Verify we can parse the document."""
    marker = 'id="readabilityBody"'
    assert marker in article.readable
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
def test_content_exists(article):
"""Verify that some content exists."""
assert isinstance(article.readable, unicode)
def test_content_exists(self):
"""Verify that some content exists."""
self.assertIsInstance(self.document.readable, unicode)
text = "S automatizovaným testováním kódu (a ve zbytku článku budu mít na mysli právě to) jsem se setkal v několika firmách."
assert text in article.readable
text = "S automatizovaným testováním kódu (a ve zbytku článku budu mít na mysli právě to) jsem se setkal v několika firmách."
self.assertIn(text, self.document.readable)
text = "Ke čtení naleznete mnoho různých materiálů, od teoretických po praktické ukázky."
assert text in article.readable
text = "Ke čtení naleznete mnoho různých materiálů, od teoretických po praktické ukázky."
self.assertIn(text, self.document.readable)
def test_content_does_not_exist(self):
"""Verify we cleaned out some content that shouldn't exist."""
self.assertNotIn("Pokud vás problematika zajímá, využijte možnosti navštívit školení", self.document.readable)
def test_content_does_not_exist(article):
    """Verify we cleaned out some content that shouldn't exist."""
    removed_text = "Pokud vás problematika zajímá, využijte možnosti navštívit školení"
    assert removed_text not in article.readable

@ -1,74 +1,64 @@
# -*- coding: utf8 -*-
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals
)
"""Test the scoring and parsing of the Article"""
import os
from __future__ import absolute_import, division, print_function, unicode_literals
import os
from operator import attrgetter
from breadability.readable import Article
from breadability.readable import check_siblings
from breadability.readable import prep_article
from ...compat import unittest
class TestArticle(unittest.TestCase):
"""Test the scoring and parsing of the Article"""
def setUp(self):
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
def tearDown(self):
"""Drop the article"""
self.article = None
def test_parses(self):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
def test_content_exists(self):
"""Verify that some content exists."""
doc = Article(self.article)
self.assertTrue('Amazon and Google' in doc.readable)
self.assertFalse('Linkblog updated' in doc.readable)
self.assertFalse(
'#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable)
@unittest.skip("Test fails because of some weird hash.")
def test_candidates(self):
"""Verify we have candidates."""
doc = Article(self.article)
# from lxml.etree import tounicode
found = False
wanted_hash = '04e46055'
for node in doc.candidates.values():
if node.hash_id == wanted_hash:
found = node
self.assertTrue(found)
# we have the right node, it must be deleted for some reason if it's
# not still there when we need it to be.
# Make sure it's not in our to drop list.
for node in doc._should_drop:
self.assertFalse(node == found.node)
by_score = sorted(
[c for c in doc.candidates.values()],
key=attrgetter('content_score'), reverse=True)
self.assertTrue(by_score[0].node == found.node)
updated_winner = check_siblings(by_score[0], doc.candidates)
updated_winner.node = prep_article(updated_winner.node)
# This article hits up against the img > p conditional filtering
# because of the many .gif images in the content. We've removed that
# rule.
import pytest
from breadability.readable import Article, check_siblings, prep_article
@pytest.fixture(scope="module")
def article():
    """Parse the stored article once per test module and return the Article."""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    # Use `f`, not `file`, so the builtin name is not shadowed.
    with open(article_path) as f:
        return Article(f.read())
def test_parses(article):
    """Verify we can parse the document."""
    assert article.readable.find('id="readabilityBody"') != -1
def test_content_exists(article):
    """Real content survives cleanup while link-heavy cruft is stripped."""
    assert 'Amazon and Google' in article.readable
    # PEP 8 idiom: `x not in y` instead of `not x in y`.
    assert 'Linkblog updated' not in article.readable
    assert '#anExampleGoogleDoesntIntendToShareBlogAndItWill' not in article.readable
@pytest.mark.skip("Test fails because of some weird hash.")
def test_candidates(article):
    """Verify we have candidates."""
    # from lxml.etree import tounicode
    # Scan all scored candidates for the node with the known hash id;
    # `found` stays False if it is missing, which fails the assert below.
    found = False
    wanted_hash = '04e46055'
    for node in article.candidates.values():
        if node.hash_id == wanted_hash:
            found = node
    assert found
    # we have the right node, it must be deleted for some reason if it's
    # not still there when we need it to be.
    # Make sure it's not in our to drop list.
    for node in article._should_drop:
        assert node != found.node
    # Highest-scoring candidate must be the node we located above.
    by_score = sorted(
        [c for c in article.candidates.values()],
        key=attrgetter('content_score'), reverse=True)
    assert by_score[0].node == found.node
    updated_winner = check_siblings(by_score[0], article.candidates)
    updated_winner.node = prep_article(updated_winner.node)
    # This article hits up against the img > p conditional filtering
    # because of the many .gif images in the content. We've removed that
    # rule.

@ -1,33 +1,32 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestSweetsharkBlog(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""
@pytest.fixture(scope="module")
def article():
    """Parse the stored article once per test module and return the Article."""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    # Use `f`, not `file`, so the builtin name is not shadowed.
    with open(article_path, "rb") as f:
        return Article(f.read(), "http://sweetshark.livejournal.com/11564.html")
def setUp(self):
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html")
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(article):
    """Verify we can parse the document."""
    readable_html = article.readable
    assert 'id="readabilityBody"' in readable_html
def test_parses(self):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
def test_content_after_video(self):
"""The div with the comments should be removed."""
self.assertIn('Stay hungry, Stay foolish', self.document.readable)
def test_content_after_video(article):
    """Text following the embedded video must be kept in the readable output."""
    assert 'Stay hungry, Stay foolish' in article.readable

@ -1,5 +1,7 @@
# -*- coding: utf8 -*-
"""Verify we can process html into a document to work off of."""
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
@ -15,88 +17,97 @@ from breadability.document import (
decode_html,
OriginalDocument,
)
from .compat import unittest
from .utils import load_snippet
class TestOriginalDocument(unittest.TestCase):
"""Verify we can process html into a document to work off of."""
def test_convert_br_tags_to_paragraphs(self):
returned = convert_breaks_to_paragraphs(
("<div>HI<br><br>How are you?<br><br> \t \n <br>"
"Fine\n I guess</div>"))
self.assertEqual(
returned,
"<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")
def test_convert_hr_tags_to_paragraphs(self):
returned = convert_breaks_to_paragraphs(
"<div>HI<br><br>How are you?<hr/> \t \n <br>Fine\n I guess</div>")
self.assertEqual(
returned,
"<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")
def test_readin_min_document(self):
"""Verify we can read in a min html document"""
doc = OriginalDocument(load_snippet('document_min.html'))
self.assertTrue(to_unicode(doc).startswith('<html>'))
self.assertEqual(doc.title, 'Min Document Title')
def test_readin_with_base_url(self):
"""Passing a url should update links to be absolute links"""
doc = OriginalDocument(
load_snippet('document_absolute_url.html'),
url="http://blog.mitechie.com/test.html")
self.assertTrue(to_unicode(doc).startswith('<html>'))
# find the links on the page and make sure each one starts with out
# base url we told it to use.
links = doc.links
self.assertEqual(len(links), 3)
# we should have two links that start with our blog url
# and one link that starts with amazon
link_counts = defaultdict(int)
for link in links:
if link.get('href').startswith('http://blog.mitechie.com'):
link_counts['blog'] += 1
else:
link_counts['other'] += 1
self.assertEqual(link_counts['blog'], 2)
self.assertEqual(link_counts['other'], 1)
def test_no_br_allowed(self):
"""We convert all <br/> tags to <p> tags"""
doc = OriginalDocument(load_snippet('document_min.html'))
self.assertIsNone(doc.dom.find('.//br'))
def test_empty_title(self):
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument(
"<html><head><title></title></head><body></body></html>")
self.assertEqual(document.title, "")
def test_title_only_with_tags(self):
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument(
"<html><head><title><em></em></title></head><body></body></html>")
self.assertEqual(document.title, "")
def test_no_title(self):
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument("<html><head></head><body></body></html>")
self.assertEqual(document.title, "")
def test_encoding(self):
text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
html = decode_html(text)
self.assertEqual(type(html), unicode)
def test_encoding_short(self):
text = to_bytes("ľščťžýáíé")
html = decode_html(text)
self.assertEqual(type(html), unicode)
self.assertEqual(html, "ľščťžýáíé")
def test_convert_br_tags_to_paragraphs():
    """Runs of <br> separators become paragraph breaks."""
    html = ("<div>HI<br><br>How are you?<br><br> \t \n <br>"
            "Fine\n I guess</div>")
    expected = "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>"
    assert convert_breaks_to_paragraphs(html) == expected
def test_convert_hr_tags_to_paragraphs():
    """<hr/> separators are treated like paragraph breaks too."""
    html = "<div>HI<br><br>How are you?<hr/> \t \n <br>Fine\n I guess</div>"
    expected = "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>"
    assert convert_breaks_to_paragraphs(html) == expected
def test_readin_min_document():
    """Verify we can read in a minimal html document."""
    document = OriginalDocument(load_snippet('document_min.html'))
    assert to_unicode(document).startswith('<html>')
    assert document.title == 'Min Document Title'
def test_readin_with_base_url():
    """Passing a url should update links to be absolute links."""
    doc = OriginalDocument(
        load_snippet('document_absolute_url.html'),
        url="http://blog.mitechie.com/test.html")
    assert to_unicode(doc).startswith('<html>')
    # Every link on the page must have been rewritten against the base url:
    # two point back at the blog, the remaining one goes elsewhere (amazon).
    links = doc.links
    assert len(links) == 3
    blog_count = sum(
        1 for link in links
        if link.get('href').startswith('http://blog.mitechie.com'))
    assert blog_count == 2
    assert len(links) - blog_count == 1
def test_no_br_allowed():
    """No <br/> tag survives — all are converted to <p> tags."""
    document = OriginalDocument(load_snippet('document_min.html'))
    assert document.dom.find('.//br') is None
def test_empty_title():
    """An empty <title> element yields an empty title string."""
    html = "<html><head><title></title></head><body></body></html>"
    assert OriginalDocument(html).title == ""
def test_title_only_with_tags():
    """A <title> containing only markup yields an empty title string."""
    html = "<html><head><title><em></em></title></head><body></body></html>"
    assert OriginalDocument(html).title == ""
def test_no_title():
    """A document with no <title> element yields an empty title string."""
    html = "<html><head></head><body></body></html>"
    assert OriginalDocument(html).title == ""
def test_encoding():
    """decode_html turns non-utf8 (iso-8859-2) bytes into a text string."""
    raw = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
    decoded = decode_html(raw)
    assert type(decoded) is unicode
def test_encoding_short():
    """decode_html decodes a short byte string back to the original text."""
    raw = to_bytes("ľščťžýáíé")
    decoded = decode_html(raw)
    assert type(decoded) is unicode
    assert decoded == "ľščťžýáíé"

@ -1,347 +1,352 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from __future__ import absolute_import, division, print_function, unicode_literals
import pytest
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from lxml.html import document_fromstring, fragment_fromstring
from breadability._compat import to_unicode
from breadability.readable import (
Article,
get_class_weight,
get_link_density,
is_bad_link,
leaf_div_elements_into_paragraphs,
score_candidates,
)
from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link,
leaf_div_elements_into_paragraphs, score_candidates, )
from breadability.scoring import ScoredNode
from .compat import unittest
from .utils import load_snippet, load_article
from .utils import load_article, load_snippet
# TestReadableDocument
"""Verify we can process html into a document to work off of."""
class TestReadableDocument(unittest.TestCase):
"""Verify we can process html into a document to work off of."""
def test_load_doc(self):
"""We get back an element tree from our original doc"""
doc = Article(load_snippet('document_min.html'))
# We get back the document as a div tag currently by default.
self.assertEqual(doc.readable_dom.tag, 'div')
def test_load_doc():
"""We get back an element tree from our original doc"""
doc = Article(load_snippet('document_min.html'))
# We get back the document as a div tag currently by default.
def test_title_loads(self):
"""Verify we can fetch the title of the parsed article"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(
doc._original_document.title,
'Min Document Title'
)
assert doc.readable_dom.tag == 'div'
def test_doc_no_scripts_styles(self):
"""Step #1 remove all scripts from the document"""
doc = Article(load_snippet('document_scripts.html'))
readable = doc.readable_dom
self.assertEqual(readable.findall(".//script"), [])
self.assertEqual(readable.findall(".//style"), [])
self.assertEqual(readable.findall(".//link"), [])
def test_find_body_exists(self):
"""If the document has a body, we store that as the readable html
No sense processing anything other than the body content.
"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_body_doesnt_exist(self):
"""If we can't find a body, then we create one.
We build our doc around the rest of the html we parsed.
"""
doc = Article(load_snippet('document_no_body.html'))
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_bare_content(self):
"""If the document is just pure content, no html tags we should be ok
We build our doc around the rest of the html we parsed.
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_no_content(self):
"""Without content we supply an empty unparsed doc."""
doc = Article('')
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
self.assertEqual(doc.readable_dom.get('class'), 'parsing-error')
class TestCleaning(unittest.TestCase):
"""Test out our cleaning processing we do."""
def test_unlikely_hits(self):
"""Verify we wipe out things from our unlikely list."""
doc = Article(load_snippet('test_readable_unlikely.html'))
readable = doc.readable_dom
must_not_appear = [
'comment', 'community', 'disqus', 'extra', 'foot',
'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']
want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']
for i in must_not_appear:
# we cannot find any class or id with this value
by_class = readable.find_class(i)
for test in by_class:
# if it's here it cannot have the must not class without the
# want to appear class
found = False
for cls in test.get('class').split():
if cls in want_to_appear:
found = True
self.assertTrue(found)
by_ids = readable.get_element_by_id(i, False)
if by_ids is not False:
found = False
for ids in test.get('id').split():
if ids in want_to_appear:
found = True
self.assertTrue(found)
def test_misused_divs_transform(self):
"""Verify we replace leaf node divs with p's
They should have the same content, just be a p vs a div
"""
test_html = "<html><body><div>simple</div></body></html>"
test_doc = document_fromstring(test_html)
self.assertEqual(
tounicode(
leaf_div_elements_into_paragraphs(test_doc)),
to_unicode("<html><body><p>simple</p></body></html>")
)
test_html2 = ('<html><body><div>simple<a href="">link</a>'
'</div></body></html>')
test_doc2 = document_fromstring(test_html2)
self.assertEqual(
tounicode(
leaf_div_elements_into_paragraphs(test_doc2)),
to_unicode(
'<html><body><p>simple<a href="">link</a></p></body></html>')
)
def test_title_loads():
"""Verify we can fetch the title of the parsed article"""
doc = Article(load_snippet('document_min.html'))
def test_dont_transform_div_with_div(self):
"""Verify that only child <div> element is replaced by <p>."""
dom = document_fromstring(
"<html><body><div>text<div>child</div>"
"aftertext</div></body></html>"
)
assert doc._original_document.title == 'Min Document Title'
self.assertEqual(
tounicode(
leaf_div_elements_into_paragraphs(dom)),
to_unicode(
"<html><body><div>text<p>child</p>"
"aftertext</div></body></html>"
)
)
def test_bad_links(self):
"""Some links should just not belong."""
bad_links = [
'<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
'<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
'<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
]
def test_doc_no_scripts_styles():
    """Step #1: every <script>, <style> and <link> element is stripped."""
    dom = Article(load_snippet('document_scripts.html')).readable_dom
    for xpath in (".//script", ".//style", ".//link"):
        assert dom.findall(xpath) == []
def test_find_body_exists():
    """The readable doc is built around the original <body> content.

    No sense processing anything other than the body content.
    """
    readable = Article(load_snippet('document_min.html')).readable_dom
    assert readable.tag == 'div'
    assert readable.get('id') == 'readabilityBody'
def test_body_doesnt_exist():
    """Without a <body> tag we build our own readable container.

    We build our doc around the rest of the html we parsed.
    """
    readable = Article(load_snippet('document_no_body.html')).readable_dom
    assert (readable.tag, readable.get('id')) == ('div', 'readabilityBody')
def test_bare_content():
    """Pure content without any html tags still parses fine.

    We build our doc around the rest of the html we parsed.
    """
    readable = Article(load_snippet('document_only_content.html')).readable_dom
    assert (readable.tag, readable.get('id')) == ('div', 'readabilityBody')
def test_no_content():
    """Empty input yields an empty doc flagged as a parsing error."""
    readable = Article('').readable_dom
    assert readable.tag == 'div'
    assert readable.get('id') == 'readabilityBody'
    assert readable.get('class') == 'parsing-error'
# Test out our cleaning processing we do.
def test_unlikely_hits():
    """Verify we wipe out elements whose class/id are on the unlikely list.

    An element carrying an "unlikely" class or id may only survive the
    cleanup when it also carries one of the "want to appear" markers that
    overrule the removal.
    """
    readable = Article(load_snippet('test_readable_unlikely.html')).readable_dom
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot',
        'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
        # NOTE: the original list had a `'pagination' ''` implicit-concat
        # typo; the value is the same, the stray '' is dropped.
        'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
        'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']
    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for marker in must_not_appear:
        # Any element still carrying the unlikely class must also carry
        # one of the overruling "want to appear" classes.
        for element in readable.find_class(marker):
            assert any(
                cls in want_to_appear
                for cls in element.get('class').split()
            )

        # Same check for ids; get_element_by_id returns the default
        # (False) when no such element exists.
        by_id = readable.get_element_by_id(marker, False)
        if by_id is not False:
            # BUG FIX: the original inspected the stale ``test`` loop
            # variable left over from the class loop instead of the
            # element actually found by id.
            assert any(
                token in want_to_appear
                for token in by_id.get('id').split()
            )
def test_misused_divs_transform():
    """Leaf <div> nodes are rewritten as <p> with identical content."""
    plain = document_fromstring("<html><body><div>simple</div></body></html>")
    assert tounicode(leaf_div_elements_into_paragraphs(plain)) == to_unicode(
        "<html><body><p>simple</p></body></html>"
    )

    with_link = document_fromstring(
        '<html><body><div>simple<a href="">link</a></div></body></html>'
    )
    assert tounicode(leaf_div_elements_into_paragraphs(with_link)) == to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>'
    )
def test_dont_transform_div_with_div():
    """Only the childless inner <div> becomes a <p>; the parent stays."""
    markup = ("<html><body><div>text<div>child</div>"
              "aftertext</div></body></html>")
    transformed = leaf_div_elements_into_paragraphs(document_fromstring(markup))
    assert tounicode(transformed) == to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )
for l in bad_links:
link = fragment_fromstring(l)
self.assertTrue(is_bad_link(link))
def test_bad_links():
    """Permalink/anchor-style links are all classified as bad."""
    bad_links = (
        '<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
        '<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
        '<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
    )
    for markup in bad_links:
        assert is_bad_link(fragment_fromstring(markup))
class TestCandidateNodes(unittest.TestCase):
"""Candidate nodes are scoring containers we use."""
def test_candidate_scores(self):
"""We should be getting back objects with some scores."""
fives = ['<div/>']
threes = ['<pre/>', '<td/>', '<blockquote/>']
neg_threes = ['<address/>', '<ol/>']
neg_fives = ['<h1/>', '<h2/>', '<h3/>', '<h4/>']
# Candidate nodes are scoring containers we use.
for n in fives:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, 5)
for n in threes:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, 3)
def test_candidate_scores():
"""We should be getting back objects with some scores."""
fives = ['<div/>']
threes = ['<pre/>', '<td/>', '<blockquote/>']
neg_threes = ['<address/>', '<ol/>']
neg_fives = ['<h1/>', '<h2/>', '<h3/>', '<h4/>']
for n in neg_threes:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, -3)
for n in fives:
doc = fragment_fromstring(n)
assert ScoredNode(doc).content_score == 5
for n in neg_fives:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, -5)
for n in threes:
doc = fragment_fromstring(n)
assert ScoredNode(doc).content_score == 3
def test_article_enables_candidate_access(self):
"""Candidates are accessible after document processing."""
doc = Article(load_article('ars.001.html'))
self.assertTrue(hasattr(doc, 'candidates'))
for n in neg_threes:
doc = fragment_fromstring(n)
assert ScoredNode(doc).content_score == -3
for n in neg_fives:
doc = fragment_fromstring(n)
assert ScoredNode(doc).content_score == -5
class TestClassWeights(unittest.TestCase):
"""Certain ids and classes get us bonus points."""
def test_positive_class(self):
"""Some classes get us bonus points."""
node = fragment_fromstring('<p class="article">')
self.assertEqual(get_class_weight(node), 25)
def test_article_enables_candidate_access():
"""Candidates are accessible after document processing."""
doc = Article(load_article('ars.001.html'))
def test_positive_ids(self):
"""Some ids get us bonus points."""
node = fragment_fromstring('<p id="content">')
self.assertEqual(get_class_weight(node), 25)
assert hasattr(doc, 'candidates')
def test_negative_class(self):
"""Some classes get us negative points."""
node = fragment_fromstring('<p class="comment">')
self.assertEqual(get_class_weight(node), -25)
def test_negative_ids(self):
"""Some ids get us negative points."""
node = fragment_fromstring('<p id="media">')
self.assertEqual(get_class_weight(node), -25)
# Certain ids and classes get us bonus points.
class TestScoringNodes(unittest.TestCase):
"""We take out list of potential nodes and score them up."""
def test_positive_class():
    """A class from the positive list is worth 25 bonus points."""
    element = fragment_fromstring('<p class="article">')
    assert get_class_weight(element) == 25
def test_we_get_candidates(self):
"""Processing candidates should get us a list of nodes to try out."""
doc = document_fromstring(load_article("ars.001.html"))
test_nodes = tuple(doc.iter("p", "td", "pre"))
candidates = score_candidates(test_nodes)
# this might change as we tweak our algorithm, but if it does,
# it signifies we need to look at what we changed.
self.assertEqual(len(candidates.keys()), 37)
def test_positive_ids():
    """An id from the positive list is worth 25 bonus points."""
    element = fragment_fromstring('<p id="content">')
    assert get_class_weight(element) == 25
# one of these should have a decent score
scores = sorted(c.content_score for c in candidates.values())
self.assertTrue(scores[-1] > 100)
def test_bonus_score_per_100_chars_in_p(self):
"""Nodes get 1 point per 100 characters up to max. 3 points."""
def build_candidates(length):
html = "<p>%s</p>" % ("c" * length)
node = fragment_fromstring(html)
return [node]
test_nodes = build_candidates(50)
candidates = score_candidates(test_nodes)
pscore_50 = max(c.content_score for c in candidates.values())
test_nodes = build_candidates(100)
candidates = score_candidates(test_nodes)
pscore_100 = max(c.content_score for c in candidates.values())
test_nodes = build_candidates(300)
candidates = score_candidates(test_nodes)
pscore_300 = max(c.content_score for c in candidates.values())
test_nodes = build_candidates(400)
candidates = score_candidates(test_nodes)
pscore_400 = max(c.content_score for c in candidates.values())
self.assertAlmostEqual(pscore_50 + 0.5, pscore_100)
self.assertAlmostEqual(pscore_100 + 2.0, pscore_300)
self.assertAlmostEqual(pscore_300, pscore_400)
class TestLinkDensityScoring(unittest.TestCase):
"""Link density will adjust out candidate scoresself."""
def test_link_density(self):
"""Test that we get a link density"""
doc = document_fromstring(load_article('ars.001.html'))
for node in doc.iter('p', 'td', 'pre'):
density = get_link_density(node)
# the density must be between 0, 1
self.assertTrue(density >= 0.0 and density <= 1.0)
class TestSiblings(unittest.TestCase):
"""Siblings will be included if their content is related."""
@unittest.skip("Not implemented yet.")
def test_bad_siblings_not_counted(self):
raise NotImplementedError()
@unittest.skip("Not implemented yet.")
def test_good_siblings_counted(self):
raise NotImplementedError()
class TestMainText(unittest.TestCase):
def test_empty(self):
article = Article("")
annotated_text = article.main_text
self.assertEqual(annotated_text, [])
def test_no_annotations(self):
article = Article("<div><p>This is text with no annotations</p></div>")
annotated_text = article.main_text
self.assertEqual(annotated_text,
[(("This is text with no annotations", None),)])
def test_one_annotation(self):
article = Article("<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
annotated_text = article.main_text
expected = [(
("This is text\nwith", None),
("no", ("del",)),
("annotations", None),
)]
self.assertEqual(annotated_text, expected)
def test_simple_snippet(self):
snippet = Article(load_snippet("annotated_1.html"))
annotated_text = snippet.main_text
expected = [
(
("Paragraph is more", None),
("better", ("em",)),
(".\nThis text is very", None),
("pretty", ("strong",)),
("'cause she's girl.", None),
),
(
("This is not", None),
("crap", ("big",)),
("so", None),
("readability", ("dfn",)),
("me :)", None),
)
]
self.assertEqual(annotated_text, expected)
def test_negative_class():
    """A class from the negative list costs 25 points."""
    element = fragment_fromstring('<p class="comment">')
    assert get_class_weight(element) == -25
def test_negative_ids():
    """An id from the negative list costs 25 points."""
    element = fragment_fromstring('<p id="media">')
    assert get_class_weight(element) == -25
# We take out list of potential nodes and score them up.
def test_we_get_candidates():
    """Scoring an article's nodes yields candidates, one scoring high."""
    dom = document_fromstring(load_article("ars.001.html"))
    candidates = score_candidates(tuple(dom.iter("p", "td", "pre")))

    # This count might change as we tweak the algorithm; if it does, it
    # signifies we need to look at what we changed.
    assert len(candidates) == 37

    # At least one of these should carry a decent score.
    assert max(c.content_score for c in candidates.values()) > 100
def test_bonus_score_per_100_chars_in_p():
    """Nodes get 1 point per 100 characters up to max. 3 points."""
    def best_score(length):
        # Score a single <p> node containing ``length`` characters and
        # return the top candidate score.
        node = fragment_fromstring("<p>%s</p>" % ("c" * length))
        candidates = score_candidates([node])
        return max(c.content_score for c in candidates.values())

    pscore_50 = best_score(50)
    pscore_100 = best_score(100)
    pscore_300 = best_score(300)
    pscore_400 = best_score(400)

    # Scores are floats; the unittest original used assertAlmostEqual,
    # so compare to 7 decimal places instead of exact float equality.
    assert round(pscore_50 + 0.5 - pscore_100, 7) == 0
    assert round(pscore_100 + 2.0 - pscore_300, 7) == 0
    assert round(pscore_300 - pscore_400, 7) == 0
# Link density will adjust out candidate scoresself.
def test_link_density():
    """Every candidate node's link density is a ratio within [0, 1]."""
    doc = document_fromstring(load_article('ars.001.html'))
    for node in doc.iter('p', 'td', 'pre'):
        # Use a chained comparison instead of the two-clause ``and``.
        assert 0.0 <= get_link_density(node) <= 1.0
# Siblings will be included if their content is related.
@pytest.mark.skip("Not implemented yet.")
def test_bad_siblings_not_counted():
    """Placeholder: unrelated sibling content must not be pulled in."""
    raise NotImplementedError()
@pytest.mark.skip("Not implemented yet.")
def test_good_siblings_counted():
    """Placeholder: related sibling content should be included."""
    raise NotImplementedError()
# TestMainText
def test_empty():
    """An empty document has no main text at all."""
    assert Article("").main_text == []
def test_no_annotations():
    """Plain text comes back as one paragraph with no annotations."""
    document = Article("<div><p>This is text with no annotations</p></div>")
    assert document.main_text == [
        (("This is text with no annotations", None),)
    ]
def test_one_annotation():
    """A <del> element is reported as an annotation on its own chunk."""
    document = Article(
        "<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
    expected = [(
        ("This is text\nwith", None),
        ("no", ("del",)),
        ("annotations", None),
    )]
    assert document.main_text == expected
def test_simple_snippet():
    """Annotations in a snippet are paired with their text chunks."""
    expected = [
        (
            ("Paragraph is more", None),
            ("better", ("em",)),
            (".\nThis text is very", None),
            ("pretty", ("strong",)),
            ("'cause she's girl.", None),
        ),
        (
            ("This is not", None),
            ("crap", ("big",)),
            ("so", None),
            ("readability", ("dfn",)),
            ("me :)", None),
        ),
    ]
    assert Article(load_snippet("annotated_1.html")).main_text == expected

@ -1,284 +1,295 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from __future__ import absolute_import, division, print_function, unicode_literals
import re
from operator import attrgetter
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from breadability.readable import Article
from breadability.scoring import (
check_node_attributes,
generate_hash_id,
get_class_weight,
score_candidates,
ScoredNode,
)
from breadability.readable import (
get_link_density,
is_unlikely_node,
)
from .compat import unittest
from lxml.html import document_fromstring, fragment_fromstring
from breadability.readable import Article, get_link_density, is_unlikely_node
from breadability.scoring import (ScoredNode, check_node_attributes, generate_hash_id, get_class_weight,
score_candidates)
from .utils import load_snippet
class TestHashId(unittest.TestCase):
def test_generate_hash(self):
dom = fragment_fromstring("<div>ľščťžýáí</div>")
generate_hash_id(dom)
def test_generate_hash():
    """Smoke test: hashing a node with non-ASCII content must not raise."""
    dom = fragment_fromstring("<div>ľščťžýáí</div>")
    generate_hash_id(dom)
def test_hash_from_id_on_exception(self):
generate_hash_id(None)
def test_different_hashes(self):
dom = fragment_fromstring("<div>ľščťžýáí</div>")
hash_dom = generate_hash_id(dom)
hash_none = generate_hash_id(None)
def test_hash_from_id_on_exception():
    """Smoke test: hashing ``None`` must not raise (fallback path)."""
    generate_hash_id(None)
self.assertNotEqual(hash_dom, hash_none)
def test_equal_hashes(self):
dom1 = fragment_fromstring("<div>ľščťžýáí</div>")
dom2 = fragment_fromstring("<div>ľščťžýáí</div>")
hash_dom1 = generate_hash_id(dom1)
hash_dom2 = generate_hash_id(dom2)
self.assertEqual(hash_dom1, hash_dom2)
def test_different_hashes():
    """A real node and the ``None`` fallback hash to different values."""
    node = fragment_fromstring("<div>ľščťžýáí</div>")
    assert generate_hash_id(node) != generate_hash_id(None)
class TestCheckNodeAttr(unittest.TestCase):
"""Verify a node has a class/id in the given set.
def test_equal_hashes():
dom1 = fragment_fromstring("<div>ľščťžýáí</div>")
dom2 = fragment_fromstring("<div>ľščťžýáí</div>")
hash_dom1 = generate_hash_id(dom1)
hash_dom2 = generate_hash_id(dom2)
assert hash_dom1 == hash_dom2
The idea is that we have sets of known good/bad ids and classes and need
to verify the given node does/doesn't have those classes/ids.
hash_none1 = generate_hash_id(None)
hash_none2 = generate_hash_id(None)
assert hash_none1 == hash_none2
# Verify a node has a class/id in the given set.
# The idea is that we have sets of known good/bad ids and classes and need
# to verify the given node does/doesn't have those classes/ids.
def test_has_class():
    """A node carrying one of the pattern's classes is matched."""
    pattern = re.compile('test1|test2', re.I)
    element = fragment_fromstring('<div/>')
    element.attrib['class'] = 'test2 comment'
    assert check_node_attributes(pattern, element, 'class')
def test_has_id():
    """A node carrying one of the pattern's ids is matched."""
    pattern = re.compile('test1|test2', re.I)
    element = fragment_fromstring('<div/>')
    element.attrib['id'] = 'test2'
    assert check_node_attributes(pattern, element, 'id')
def test_lacks_class():
    """A class outside our pattern's set is not matched."""
    pattern = re.compile('test1|test2', re.I)
    element = fragment_fromstring('<div/>')
    element.attrib['class'] = 'test4 comment'
    assert not check_node_attributes(pattern, element, 'class')
def test_lacks_id():
    """An id outside our pattern's set is not matched."""
    pattern = re.compile('test1|test2', re.I)
    element = fragment_fromstring('<div/>')
    element.attrib['id'] = 'test4'
    assert not check_node_attributes(pattern, element, 'id')
# Verify we calc our link density correctly.
def test_empty_node():
    """An empty node doesn't have much of a link density."""
    assert get_link_density(Article("<div></div>").readable_dom) == 0.0
def test_small_doc_no_links():
    """A minimal document with no anchors has zero link density."""
    document = Article(load_snippet('document_min.html'))
    assert get_link_density(document.readable_dom) == 0.0
def test_several_links():
    """This doc has 3 links with the majority of content."""
    doc = Article(load_snippet('document_absolute_url.html'))
    # The unittest original used assertAlmostEqual; exact float equality
    # against 22/37 is fragile, so compare to 7 decimal places instead.
    assert round(get_link_density(doc.readable_dom) - 22 / 37, 7) == 0
# Verify we score nodes correctly based on their class/id attributes.
def test_no_matches_zero():
    """Without class/id attributes the weight stays at 0."""
    assert get_class_weight(fragment_fromstring("<div></div>")) == 0
"""
def test_has_class(self):
"""Verify that a node has a class in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('class', 'test2 comment')
self.assertTrue(
check_node_attributes(test_pattern, test_node, 'class'))
def test_has_id(self):
"""Verify that a node has an id in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test2')
self.assertTrue(check_node_attributes(test_pattern, test_node, 'id'))
def test_lacks_class(self):
"""Verify that a node does not have a class in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('class', 'test4 comment')
self.assertFalse(
check_node_attributes(test_pattern, test_node, 'class'))
def test_lacks_id(self):
"""Verify that a node does not have an id in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test4')
self.assertFalse(check_node_attributes(test_pattern, test_node, 'id'))
class TestLinkDensity(unittest.TestCase):
"""Verify we calc our link density correctly."""
def test_empty_node(self):
"""An empty node doesn't have much of a link density"""
doc = Article("<div></div>")
self.assertEqual(get_link_density(doc.readable_dom), 0.0)
def test_small_doc_no_links(self):
doc = Article(load_snippet('document_min.html'))
self.assertEqual(get_link_density(doc.readable_dom), 0.0)
def test_several_links(self):
"""This doc has a 3 links with the majority of content."""
doc = Article(load_snippet('document_absolute_url.html'))
self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/37)
class TestClassWeight(unittest.TestCase):
"""Verify we score nodes correctly based on their class/id attributes."""
def test_no_matches_zero(self):
"""If you don't have the attribute then you get a weight of 0"""
node = fragment_fromstring("<div></div>")
self.assertEqual(get_class_weight(node), 0)
def test_id_hits(self):
"""If the id is in the list then it gets a weight"""
test_div = '<div id="post">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
test_div = '<div id="comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), -25)
def test_class_hits(self):
"""If the class is in the list then it gets a weight"""
test_div = '<div class="something post">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
test_div = '<div class="something comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), -25)
def test_scores_collide(self):
"""We might hit both positive and negative scores.
Positive and negative scoring is done independently so it's possible
to hit both positive and negative scores and cancel each other out.
"""
test_div = '<div id="post" class="something comment">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 0)
test_div = '<div id="post" class="post comment">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
def test_scores_only_once(self):
"""Scoring is not cumulative within a class hit."""
test_div = '<div class="post main">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
class TestUnlikelyNode(unittest.TestCase):
"""is_unlikely_node should help verify our node is good/bad."""
def test_body_is_always_likely(self):
"""The body tag is always a likely node."""
test_div = '<body class="comment"><div>Content</div></body>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
def test_is_unlikely(self):
"Keywords in the class/id will make us believe this is unlikely."
test_div = '<div class="something comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertTrue(is_unlikely_node(node))
test_div = '<div id="comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertTrue(is_unlikely_node(node))
def test_not_unlikely(self):
"""Suck it double negatives."""
test_div = '<div id="post">Content</div>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
test_div = '<div class="something post">Content</div>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
def test_maybe_hits(self):
"""We've got some maybes that will overrule an unlikely node."""
test_div = '<div id="comments" class="article">Content</div>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
class TestScoredNode(unittest.TestCase):
"""ScoredNodes constructed have initial content_scores, etc."""
def test_hash_id(self):
"""ScoredNodes have a hash_id based on their content
Since this is based on the html there are chances for collisions, but
it helps us follow and identify nodes through the scoring process. Two
identical nodes would score the same, so meh all good.
"""
test_div = '<div id="comments" class="article">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.hash_id, 'ffa4c519')
def test_div_content_score(self):
"""A div starts out with a score of 5 and modifies from there"""
test_div = '<div id="" class="">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, 5)
test_div = '<div id="article" class="">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, 30)
test_div = '<div id="comments" class="">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, -20)
def test_headings_score(self):
"""Heading tags aren't likely candidates, hurt their scores."""
test_div = '<h2>Heading</h2>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, -5)
def test_list_items(self):
"""Heading tags aren't likely candidates, hurt their scores."""
test_div = '<li>list item</li>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, -3)
class TestScoreCandidates(unittest.TestCase):
"""The grand daddy of tests to make sure our scoring works
Now scoring details will change over time, so the most important thing is
to make sure candidates come out in the right order, not necessarily how
they scored. Make sure to keep this in mind while getting tests going.
def test_id_hits():
    """Ids on the positive/negative lists add or subtract weight."""
    positive = fragment_fromstring('<div id="post">Content</div>')
    assert get_class_weight(positive) == 25

    negative = fragment_fromstring('<div id="comments">Content</div>')
    assert get_class_weight(negative) == -25
def test_class_hits():
    """Classes on the positive/negative lists add or subtract weight."""
    positive = fragment_fromstring('<div class="something post">Content</div>')
    assert get_class_weight(positive) == 25

    negative = fragment_fromstring('<div class="something comments">Content</div>')
    assert get_class_weight(negative) == -25
def test_scores_collide():
    """Positive and negative hits are scored independently.

    A node can therefore collect both a bonus and a penalty, which may
    cancel each other out entirely.
    """
    cancelled = fragment_fromstring(
        '<div id="post" class="something comment">Content</div>')
    assert get_class_weight(cancelled) == 0

    net_positive = fragment_fromstring(
        '<div id="post" class="post comment">Content</div>')
    assert get_class_weight(net_positive) == 25
def test_scores_only_once():
    """Several positive classes on one node still score a single bonus."""
    element = fragment_fromstring('<div class="post main">Content</div>')
    assert get_class_weight(element) == 25
# is_unlikely_node should help verify our node is good/bad.
def test_simple_candidate_set(self):
"""Tests a simple case of two candidate nodes"""
html = """
<html>
<body>
<div class="content">
<p>This is a great amount of info</p>
<p>And more content <a href="/index">Home</a>
</div>
<div class="footer">
<p>This is a footer</p>
<p>And more content <a href="/index">Home</a>
</div>
</body>
</html>
"""
dom = document_fromstring(html)
div_nodes = dom.findall(".//div")
candidates = score_candidates(div_nodes)
ordered = sorted(
(c for c in candidates.values()), reverse=True,
key=attrgetter("content_score"))
self.assertEqual(ordered[0].node.tag, "div")
self.assertEqual(ordered[0].node.attrib["class"], "content")
self.assertEqual(ordered[1].node.tag, "body")
self.assertEqual(ordered[2].node.tag, "html")
self.assertEqual(ordered[3].node.tag, "div")
self.assertEqual(ordered[3].node.attrib["class"], "footer")
def test_body_is_always_likely():
    """A <body> element is never classified as unlikely, whatever its class."""
    body = fragment_fromstring('<body class="comment"><div>Content</div></body>')
    assert not is_unlikely_node(body)
def test_is_unlikely():
    """Keywords in the class/id will make us believe this is unlikely."""
    unlikely_markup = (
        '<div class="something comments">Content</div>',
        '<div id="comments">Content</div>',
    )
    for markup in unlikely_markup:
        assert is_unlikely_node(fragment_fromstring(markup))
def test_not_unlikely():
    """Suck it double negatives."""
    likely_markup = (
        '<div id="post">Content</div>',
        '<div class="something post">Content</div>',
    )
    for markup in likely_markup:
        assert not is_unlikely_node(fragment_fromstring(markup))
def test_maybe_hits():
    """A "maybe" keyword (article) overrules an otherwise unlikely id."""
    node = fragment_fromstring('<div id="comments" class="article">Content</div>')
    assert not is_unlikely_node(node)
# ScoredNodes constructed have initial content_scores, etc.
def test_hash_id():
    """ScoredNodes have a hash_id based on their content

    Since this is based on the html there are chances for collisions, but
    it helps us follow and identify nodes through the scoring process. Two
    identical nodes would score the same, so meh all good.
    """
    node = fragment_fromstring('<div id="comments" class="article">Content</div>')
    scored = ScoredNode(node)
    assert scored.hash_id == 'ffa4c519'
def test_div_content_score():
    """A div starts out with a score of 5 and modifies from there"""
    # (markup, expected score): neutral div, positive id, negative id.
    cases = (
        ('<div id="" class="">Content</div>', 5),
        ('<div id="article" class="">Content</div>', 30),
        ('<div id="comments" class="">Content</div>', -20),
    )
    for markup, expected in cases:
        scored = ScoredNode(fragment_fromstring(markup))
        assert scored.content_score == expected
def test_headings_score():
    """Heading tags aren't likely candidates, hurt their scores."""
    heading = fragment_fromstring('<h2>Heading</h2>')
    assert ScoredNode(heading).content_score == -5
def test_list_items():
    """List items aren't likely candidates, hurt their scores."""
    test_div = '<li>list item</li>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)
    assert snode.content_score == -3
# The grand daddy of tests to make sure our scoring works
# Now scoring details will change over time, so the most important thing is
# to make sure candidates come out in the right order, not necessarily how
# they scored. Make sure to keep this in mind while getting tests going.
def test_simple_candidate_set():
    """Score two sibling divs and verify their relative ranking.

    Scoring details drift over time, so the assertions pin the order of
    the candidates rather than their exact scores.
    """
    html = """
    <html>
        <body>
            <div class="content">
                <p>This is a great amount of info</p>
                <p>And more content <a href="/index">Home</a>
            </div>
            <div class="footer">
                <p>This is a footer</p>
                <p>And more content <a href="/index">Home</a>
            </div>
        </body>
    </html>
    """
    document = document_fromstring(html)
    candidates = score_candidates(document.findall(".//div"))

    by_score = sorted(
        candidates.values(),
        key=attrgetter("content_score"),
        reverse=True,
    )

    # Content-rich div wins, its ancestors follow, the footer ranks last.
    assert by_score[0].node.tag == "div"
    assert by_score[0].node.attrib["class"] == "content"
    assert by_score[1].node.tag == "body"
    assert by_score[2].node.tag == "html"
    assert by_score[3].node.tag == "div"
    assert by_score[3].node.attrib["class"] == "footer"
