Make sure we return body with our css class on it

12 years ago · 7960264c3b
parent e93a52a748
commit 7960264c3b
2 changed files with 47 additions and 2 deletions
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@@ -1,23 +1,37 @@
 from breadability.document import OriginalDocument
 from breadability.utils import cached_property

+
 def drop_tag(doc, *tags):
    [[n.drop_tree() for n in doc.iterfind(".//" + tag)]
            for tag in tags]
    return doc


+def build_base_document(html):
+    """Return a base document with the body as root.
+
+    html should be a parsed Element object.
+
+    """
+    found_body = html.find('.//body')
+    if found_body is not None:
+        # replace any existing CSS class with our own
+        found_body.set('class', 'readabilityBody')
+        return found_body
+
+
 class Article(object):
    """Parsed readable object"""

    def __init__(self, html, url=None):
        self.orig = OriginalDocument(html, url=url)

-
    @cached_property(ttl=600)
    def readable(self):
        """The readable parsed article"""
        doc = self.orig.html
+        doc = build_base_document(doc)
        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
        return doc

--- a/src/breadability/tests/test_readable.py
+++ b/src/breadability/tests/test_readable.py
@@ -11,7 +11,8 @@ class TestOriginalDocument(TestCase):
    def test_load_doc(self):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
-        self.assertEqual(doc.readable.tag, 'html')
+        # We get back the document as a body tag currently by default.
+        self.assertEqual(doc.readable.tag, 'body')

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
@@ -20,3 +21,33 @@ class TestOriginalDocument(TestCase):
        self.assertEqual(readable.findall(".//script"), [])
        self.assertEqual(readable.findall(".//style"), [])
        self.assertEqual(readable.findall(".//link"), [])
+
+    def test_find_body_exists(self):
+        """If the document has a body, we store that as the readable html
+
+        No sense processing anything other than the body content.
+
+        """
+        doc = Article(load_snippet('document_min.html'))
+        self.assertEqual(doc.readable.tag, 'body')
+        self.assertEqual(doc.readable.get('class'), 'readabilityBody')
+
+    def test_body_doesnt_exist(self):
+        """If we can't find a body, then we create one.
+
+        We build our doc around the rest of the html we parsed.
+
+        """
+        doc = Article(load_snippet('document_no_body.html'))
+        self.assertEqual(doc.readable.tag, 'body')
+        self.assertEqual(doc.readable.get('class'), 'readabilityBody')
+
+    def test_bare_content(self):
+        """If the document is just pure content, no html tags, we should be ok
+
+        We build our doc around the rest of the html we parsed.
+
+        """
+        doc = Article(load_snippet('document_only_content.html'))
+        self.assertEqual(doc.readable.tag, 'body')
+        self.assertEqual(doc.readable.get('class'), 'readabilityBody')