Start to add some processing for the readable content

- Add removal of style, script, and similar non-content elements from the document
pull/4/merge
Richard Harding 12 years ago
parent 2e7fb0aa89
commit e93a52a748

@ -1,10 +1,23 @@
from breadable.document import OriginalDocument
from breadability.document import OriginalDocument
from breadability.utils import cached_property
def drop_tag(doc, *tags):
    """Drop every descendant of ``doc`` that matches one of ``tags``.

    Each tag name is looked up with ``doc.iterfind(".//" + tag)`` and
    ``drop_tree()`` is called on every node found.

    :param doc: the node to search; it must provide ``iterfind``.
    :param tags: tag names whose matching descendants are dropped.
    :returns: the same ``doc`` object that was passed in.
    """
    for tag in tags:
        # Snapshot the matches before dropping anything so that removing
        # nodes cannot disturb the traversal that is still in progress.
        for node in list(doc.iterfind(".//" + tag)):
            node.drop_tree()
    return doc
class Article(object):
    """Parsed readable object.

    Wraps the supplied html in an ``OriginalDocument`` (kept on
    ``self.orig``) and exposes a cleaned-up view of it as ``readable``.
    """

    def __init__(self, html, url=None):
        """Store the original document built from ``html``.

        :param html: html content handed to ``OriginalDocument``.
        :param url: optional url forwarded to ``OriginalDocument``.
        """
        self.orig = OriginalDocument(html, url=url)

    @cached_property(ttl=600)
    def readable(self):
        """The readable parsed article.

        Starts from ``self.orig.html`` and runs it through ``drop_tag`` to
        remove ``script``, ``link``, ``style`` and ``noscript`` elements,
        then returns the resulting document.
        """
        doc = self.orig.html
        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
        return doc

@ -0,0 +1,9 @@
from os import path
# Directory containing this module; snippets are resolved relative to it.
TEST_DIR = path.dirname(__file__)


def load_snippet(filename):
    """Helper to fetch in the content of a test snippet.

    :param filename: name of a file inside the ``test_snippets`` directory
        located under ``TEST_DIR``.
    :returns: the full content of that file as read by ``open().read()``.
    """
    snippet_path = path.join(TEST_DIR, 'test_snippets', filename)
    # Use a context manager so the file handle is closed as soon as the
    # content has been read instead of being left open.
    with open(snippet_path) as snippet:
        return snippet.read()

@ -1,19 +1,12 @@
from collections import defaultdict
from os import path
from unittest import TestCase
from breadability.document import OriginalDocument
# Directory containing this module; snippets are resolved relative to it.
TEST_DIR = path.dirname(__file__)


def load_snippet(filename):
    """Helper to fetch in the content of a test snippet.

    :param filename: name of a file inside the ``test_snippets`` directory
        located under ``TEST_DIR``.
    :returns: the full content of that file as read by ``open().read()``.
    """
    snippet_path = path.join(TEST_DIR, 'test_snippets', filename)
    # Use a context manager so the file handle is closed as soon as the
    # content has been read instead of being left open.
    with open(snippet_path) as snippet:
        return snippet.read()
from breadability.tests import load_snippet
class TestOriginalDocuemtn(TestCase):
"""Verify we can process html into a document to work off of."""
def test_readin_min_document(self):
@ -44,5 +37,3 @@ class TestOriginalDocuemtn(TestCase):
self.assertEqual(link_counts['blog'], 2)
self.assertEqual(link_counts['other'], 1)

@ -0,0 +1,22 @@
from collections import defaultdict
from unittest import TestCase
from breadability.readable import Article
from breadability.tests import load_snippet
class TestOriginalDocument(TestCase):
    """Verify the readable view an Article builds from an html document."""

    def test_load_doc(self):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable.tag, 'html')

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
        doc = Article(load_snippet('document_scripts.html'))
        readable = doc.readable
        # Check every tag that Article.readable strips, noscript included.
        for tag in ('script', 'style', 'link', 'noscript'):
            self.assertEqual(readable.findall(".//" + tag), [])

@ -0,0 +1,24 @@
<html>
<head>
<title>Min Document Title</title>
<script src="something.js"></script>
<script src="something.js" />
<link rel="stylesheet" href="style.css" type="text/css">
<style type="text/css">
body {
}
</style>
</head>
<body>
<h1>Min Document</h1>
<a href="/about.hml">About Us</a>
<a href="http://blog.mitechie.com/test.hml">About Us</a>
<a href="http://amazon.com/test.hml">Amazon</a>
<div id="footer">
<script type="text/javascript">
// please go away for readability
</script>
</div>
</body>
</html>
Loading…
Cancel
Save