Start to add some processing for the readable content

- Add removal of style, script, etc. bits in the content
12 years ago · e93a52a748
parent 2e7fb0aa89
commit e93a52a748
5 changed files with 73 additions and 14 deletions
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -1,10 +1,23 @@
-from breadable.document import OriginalDocument
+from breadability.document import OriginalDocument
+from breadability.utils import cached_property
+
+def drop_tag(doc, *tags):
+    [[n.drop_tree() for n in doc.iterfind(".//" + tag)]
+            for tag in tags]
+    return doc


 class Article(object):
    """Parsed readable object"""

-    def __init__(self):
-        pass
+    def __init__(self, html, url=None):
+        self.orig = OriginalDocument(html, url=url)
+

+    @cached_property(ttl=600)
+    def readable(self):
+        """The readable parsed article"""
+        doc = self.orig.html
+        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
+        return doc

--- a/src/breadability/tests/init.py
+++ b/src/breadability/tests/init.py
@ -0,0 +1,9 @@
+from os import path
+
+
+TEST_DIR = path.dirname(__file__)
+
+
+def load_snippet(filename):
+    """Helper to fetch in the content of a test snippet"""
+    return open(path.join(TEST_DIR, 'test_snippets', filename)).read()
--- a/src/breadability/tests/test_orig_document.py
+++ b/src/breadability/tests/test_orig_document.py
@ -1,19 +1,12 @@
 from collections import defaultdict
-from os import path
 from unittest import TestCase

 from breadability.document import OriginalDocument
-
-
-TEST_DIR = path.dirname(__file__)
-
-
-def load_snippet(filename):
-    """Helper to fetch in the content of a test snippet"""
-    return open(path.join(TEST_DIR, 'test_snippets', filename)).read()
+from breadability.tests import load_snippet


 class TestOriginalDocuemtn(TestCase):
+
    """Verify we can process html into a document to work off of."""

    def test_readin_min_document(self):
@ -44,5 +37,3 @@ class TestOriginalDocuemtn(TestCase):

        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)
-
-
--- a/src/breadability/tests/test_readable.py
+++ b/src/breadability/tests/test_readable.py
@ -0,0 +1,22 @@
+from collections import defaultdict
+from unittest import TestCase
+
+from breadability.readable import Article
+from breadability.tests import load_snippet
+
+
+class TestOriginalDocument(TestCase):
+    """Verify we can process html into a document to work off of."""
+
+    def test_load_doc(self):
+        """We get back an element tree from our original doc"""
+        doc = Article(load_snippet('document_min.html'))
+        self.assertEqual(doc.readable.tag, 'html')
+
+    def test_doc_no_scripts_styles(self):
+        """Step #1 remove all scripts from the document"""
+        doc = Article(load_snippet('document_scripts.html'))
+        readable = doc.readable
+        self.assertEqual(readable.findall(".//script"), [])
+        self.assertEqual(readable.findall(".//style"), [])
+        self.assertEqual(readable.findall(".//link"), [])
--- a/src/breadability/tests/test_snippets/document_scripts.html
+++ b/src/breadability/tests/test_snippets/document_scripts.html
@ -0,0 +1,24 @@
+<html>
+    <head>
+        <title>Min Document Title</title>
+        <script src="something.js"></script>
+        <script src="something.js" />
+        <link rel="stylesheet" href="style.css" type="text/css">
+        <style type="text/css">
+            body {
+
+            }
+        </style>
+    </head>
+    <body>
+        <h1>Min Document</h1>
+        <a href="/about.hml">About Us</a>
+        <a href="http://blog.mitechie.com/test.hml">About Us</a>
+        <a href="http://amazon.com/test.hml">Amazon</a>
+        <div id="footer">
+            <script type="text/javascript">
+                // please go away for readability
+            </script>
+        </div>
+    </body>
+</html>