Make sure we return the body with our CSS class on it

pull/4/merge
Richard Harding 12 years ago
parent e93a52a748
commit 7960264c3b

@ -1,23 +1,37 @@
from breadability.document import OriginalDocument
from breadability.utils import cached_property
def drop_tag(doc, *tags):
    """Remove every descendant of ``doc`` whose tag is one of ``tags``.

    Each match is removed by calling its ``drop_tree()`` method.

    :param doc: parsed element supporting ``iterfind``; matched descendants
        must provide ``drop_tree()`` (presumably lxml.html elements --
        confirm against callers).
    :param tags: tag names to strip, e.g. ``'script'`` or ``'style'``.
    :returns: the same ``doc`` object that was passed in.
    """
    for tag in tags:
        # Snapshot the matches first so the tree is never modified while
        # the search iterator is still walking it.
        for node in list(doc.iterfind(".//" + tag)):
            node.drop_tree()
    return doc
def build_base_document(html):
    """Return a base document with the body as root.

    :param html: a parsed Element object.
    :returns: a ``<body>`` element whose class is ``readabilityBody``.
        If ``html`` is itself the body, or contains one, that element is
        returned. Otherwise a new ``<body>`` is created and ``html`` is
        appended to it, so callers always get a body back.
    """
    if html.tag == 'body':
        # './/body' only searches below the root, so a document whose root
        # is already the body has to be recognised explicitly.
        found_body = html
    else:
        found_body = html.find('.//body')

    if found_body is None:
        # No body anywhere (e.g. a head-only document or bare content):
        # build one around whatever was parsed instead of returning None.
        # NOTE(review): assumes html is a root element, so appending it
        # does not pull it out of some other parent -- confirm.
        found_body = html.makeelement('body', {})
        found_body.append(html)

    # Replace any existing CSS class with our own.
    found_body.set('class', 'readabilityBody')
    return found_body
class Article(object):
    """A parsed document that exposes its readable content."""

    def __init__(self, html, url=None):
        """Wrap ``html`` (and the optional source ``url``) in an
        ``OriginalDocument`` stored on ``self.orig``.
        """
        self.orig = OriginalDocument(html, url=url)

    @cached_property(ttl=600)
    def readable(self):
        """The readable version of the article.

        Takes the parsed html of the original document, reduces it to its
        base (body-rooted) document and strips the script, link, style and
        noscript tags from it.
        """
        base = build_base_document(self.orig.html)
        return drop_tag(base, 'script', 'link', 'style', 'noscript')

@ -11,7 +11,8 @@ class TestOriginalDocument(TestCase):
def test_load_doc(self):
    """Loading a document gives back an element tree rooted at the body."""
    doc = Article(load_snippet('document_min.html'))
    # We get back the document as a body tag currently by default. The
    # older expectation of an 'html' root contradicted this and was dropped.
    self.assertEqual(doc.readable.tag, 'body')
def test_doc_no_scripts_styles(self):
"""Step #1 remove all scripts from the document"""
@ -20,3 +21,33 @@ class TestOriginalDocument(TestCase):
self.assertEqual(readable.findall(".//script"), [])
self.assertEqual(readable.findall(".//style"), [])
self.assertEqual(readable.findall(".//link"), [])
def test_find_body_exists(self):
    """A document that has a body uses that body as the readable html.

    There is no sense processing anything other than the body content.
    """
    article = Article(load_snippet('document_min.html'))
    readable = article.readable
    self.assertEqual(readable.tag, 'body')
    self.assertEqual(readable.get('class'), 'readabilityBody')
def test_body_doesnt_exist(self):
    """When no body can be found, one is created.

    The doc is built around the rest of the html that was parsed.
    """
    article = Article(load_snippet('document_no_body.html'))
    readable = article.readable
    self.assertEqual(readable.tag, 'body')
    self.assertEqual(readable.get('class'), 'readabilityBody')
def test_bare_content(self):
    """A document that is pure content, with no html tags, is still handled.

    The doc is built around the rest of the html that was parsed.
    """
    article = Article(load_snippet('document_only_content.html'))
    readable = article.readable
    self.assertEqual(readable.tag, 'body')
    self.assertEqual(readable.get('class'), 'readabilityBody')

Loading…
Cancel
Save