@@ -12,6 +12,7 @@ from breadability.scoring import get_class_weight
+from breadability.scoring import is_unlikely_node
 from breadability.utils import cached_property


 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
                        page_structure=False, processing_instructions=True,
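As a point of reference, here is a minimal sketch (not part of the patch) of
what a Cleaner configured this way does to a fragment; the sample HTML is
invented for illustration:

    from lxml.html import fragment_fromstring, tostring
    from lxml.html.clean import Cleaner

    # same flags as html_cleaner above: strip scripts/styles/comments but
    # keep <meta> and the overall page structure intact
    cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                      style=True, links=True, meta=False, add_nofollow=False,
                      page_structure=False, processing_instructions=True)

    doc = fragment_fromstring(
        '<div><script>alert(1)</script>'
        '<p style="color: red">kept text</p></div>')
    cleaner(doc)          # Cleaner instances are callable; cleaning is in place
    print(tostring(doc))  # the <script> is gone, the style attribute is wiped
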
@@ -149,7 +150,7 @@ def prep_article(doc):
     """


 def clean_document(node):
-    """Remove the style attribute on every element."""
+    """Clean up the final document we return as the readable article"""
     clean_list = ['object', 'h1']
     keep_keywords = ['youtube', 'blip.tv', 'vimeo']
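The two lists above drive the tag-cleaning pass, but the check itself sits
outside this hunk. The following is only a plausible sketch of how they could
interact (the serialized-markup keyword test is an assumption, echoing
readability.js's video whitelist):

    from lxml.html import fragment_fromstring, tostring

    clean_list = ['object', 'h1']
    keep_keywords = ['youtube', 'blip.tv', 'vimeo']

    node = fragment_fromstring(
        '<div><object data="http://youtube.com/v/abc"></object>'
        '<object data="http://ads.example.com/x"></object></div>')

    for tag in clean_list:
        # findall() returns a list, so dropping while looping is safe here
        for e in node.findall('.//' + tag):
            markup = tostring(e).decode('utf-8')
            if not any(keyword in markup for keyword in keep_keywords):
                e.drop_tree()  # the ad <object> goes; the youtube embed stays
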
@@ -158,9 +159,10 @@ def prep_article(doc):
     if len(node.findall('.//h2')) == 1:
         clean_list.append('h2')

-    for n in node.getiterator():
+    for n in node.iter():
         # clean out any inline style properties
-        n.set('style', '')
+        if 'style' in n.attrib:
+            n.set('style', '')

     # remove all of the following tags
     # Clean a node of all elements of type "tag".
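Two things happen in this hunk: getiterator(), long deprecated in ElementTree
and lxml in favor of iter(), is swapped out, and the unconditional
n.set('style', '') gains a guard. Without the guard, set() adds an empty
style attribute to every element that never had one. A standalone
illustration (invented fragment):

    from lxml.html import fragment_fromstring, tostring

    node = fragment_fromstring('<div><p style="color: red">a</p><p>b</p></div>')
    for n in node.iter():        # iter() replaces the deprecated getiterator()
        if 'style' in n.attrib:  # only touch elements that carry the attribute
            n.set('style', '')
    print(tostring(node))        # the second <p> stays attribute-free

Note the patch empties the attribute rather than removing it; a
del n.attrib['style'] would drop it outright.
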
@@ -187,7 +189,9 @@ def prep_article(doc):
             # if the heading has no css weight or a high link density,
             # remove it
             if get_class_weight(n) < 0 or get_link_density(n) > .33:
-                n.drop_tree()
+                # for some reason we get nodes here without a parent
+                if n.getparent() is not None:
+                    n.drop_tree()

         # clean out extra <p>
         if n.tag == 'p':
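The new guard matters because lxml's drop_tree() requires a parent (it
asserts one exists), and a node already detached earlier in the loop no
longer has one. A minimal reproduction of the failure mode the new comment
describes:

    from lxml.html import fragment_fromstring

    node = fragment_fromstring('<div><h2>title</h2></div>')
    h2 = node.find('h2')
    h2.drop_tree()                  # fine: h2 still had a parent
    if h2.getparent() is not None:  # the patch's guard
        h2.drop_tree()              # skipped; unguarded it would assert
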
@@ -205,29 +209,22 @@ def prep_article(doc):
     return doc


-def process(doc):
-    """Process this doc to make it readable.
+def find_candidates(doc):
+    """Find candidate nodes for the readable version of the article.

     Here we're going to remove unlikely nodes, find scores on the rest, and
     clean up and return the final best match.

     """
-    unlikely = []
     scorable_node_tags = ['p', 'td', 'pre']
     nodes_to_score = []

-    for node in doc.getiterator():
+    for node in doc.iter():
         if is_unlikely_node(node):
-            unlikely.append(node)
-
-        if node.tag in scorable_node_tags:
+            node.drop_tree()
+        elif node.tag in scorable_node_tags:
             nodes_to_score.append(node)

-    # process our clean up instructions
-    [n.drop_tree() for n in unlikely]
-
-    candidates = score_candidates(nodes_to_score)
-    return candidates
+    return score_candidates(nodes_to_score)


 class Article(object):
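Beyond the rename, this hunk tightens the control flow: unlikely nodes are
dropped as soon as they are found instead of being collected into unlikely
and swept afterwards by a list comprehension run purely for its side effects.
The elif also guarantees a dropped node can never land in nodes_to_score. Had
the batching survived, a plain loop is the idiomatic spelling of the deleted
line:

    # equivalent of the removed `[n.drop_tree() for n in unlikely]`,
    # without building a throwaway list of None values
    for n in unlikely:
        n.drop_tree()
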
@@ -250,7 +247,7 @@ class Article(object):
         html_cleaner(doc)
         doc = drop_tag(doc, 'noscript')
         doc = transform_misused_divs_into_paragraphs(doc)
-        candidates = process(doc)
+        candidates = find_candidates(doc)

         if candidates:
             # right now we return the highest scoring candidate content
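For orientation, the visible call order amounts to the pipeline below. The
sketch assumes the surrounding module's helpers (drop_tag and
transform_misused_divs_into_paragraphs are defined there but not shown in
this hunk) and uses an invented HTML string:

    from lxml.html import fromstring

    doc = fromstring('<html><body><div>Long article text ...</div></body></html>')
    html_cleaner(doc)                                  # cleans the tree in place
    doc = drop_tag(doc, 'noscript')                    # drop <noscript> nodes
    doc = transform_misused_divs_into_paragraphs(doc)  # div-as-paragraph fixup
    candidates = find_candidates(doc)                  # falsy when nothing scored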