Add some more debugging to support tracing wtf we did and why

12 years ago · 14bbe701eb
parent 00ba7e5164
commit 14bbe701eb
3 changed files with 36 additions and 5 deletions
--- a/src/breadability/document.py
+++ b/src/breadability/document.py
@ -8,10 +8,10 @@ from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import HTMLParser

+from breadability.logconfig import LOG
 from breadability.utils import cached_property


-LOG = logging.getLogger(__name__)
 utf8_parser = HTMLParser(encoding='utf-8')


@ -38,6 +38,7 @@ def get_encoding(page):

 def replace_multi_br_to_paragraphs(html):
    """Convert multiple <br>s into paragraphs"""
+    LOG.debug('Replacing multiple <br/> to <p>')
    rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
    return rep.sub('</p><p>', html)

@ -81,6 +82,7 @@ class OriginalDocument(object):
        # doc = html_cleaner.clean_html(doc)
        base_href = self.url
        if base_href:
+            LOG.debug('Making links absolute')
            doc.make_links_absolute(base_href, resolve_base_href=True)
        else:
            doc.resolve_base_href()
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -5,6 +5,8 @@ from lxml.etree import tostring
 from lxml.html.clean import Cleaner
 from lxml.html import fragment_fromstring
 from lxml.html import fromstring
+from pprint import PrettyPrinter
+
 from breadability.document import OriginalDocument
 from breadability.logconfig import LOG
 from breadability.scoring import score_candidates
@ -30,8 +32,9 @@ def drop_tag(doc, *tags):
    """
    for tag in tags:
        found = doc.iterfind(".//" + tag)
-        if found:
-            [n.drop_tree for n in found]
+        for n in found:
+            LOG.debug("Dropping tag: " + str(n))
+            n.drop_tree()
    return doc


@ -78,11 +81,11 @@ def transform_misused_divs_into_paragraphs(doc):
            # We need to create a <p> and put all it's contents in there
            # We'll just stringify it, then regex replace the first/last
            # div bits to turn them into <p> vs <div>.
+            LOG.debug('Turning leaf <div> into <p>')
            orig = tounicode(elem).strip()
            started = re.sub(r'^<\s*div', '<p', orig)
            ended = re.sub(r'div>$', 'p>', started)
            elem.getparent().replace(elem, fromstring(ended))
-
    return doc


@ -129,6 +132,7 @@ def check_siblings(candidate_node, candidate_list):
                    append = True

        if append:
+            LOG.debug('Sibling being appended' + str(sibling))
            if sibling.tag not in ['div', 'p']:
                # We have a node that isn't a common block level element, like
                # a form or td tag. Turn it into a div so it doesn't get
@ -152,12 +156,14 @@ def prep_article(doc):
    """
    def clean_document(node):
        """Clean up the final document we return as the readable article"""
+        LOG.debug('Cleaning document')
        clean_list = ['object', 'h1']
        keep_keywords = ['youtube', 'blip.tv', 'vimeo']

        # If there is only one h2, they are probably using it as a header and
        # not a subheader, so remove it since we already have a header.
        if len(node.findall('.//h2')) == 1:
+            LOG.debug('Adding H2 to list of nodes to clean.')
            clean_list.append('h2')

        for n in node.iter():
@ -183,7 +189,10 @@ def prep_article(doc):
                        if not allow and key in node_str:
                            allow = True
                if not allow:
+                    LOG.debug('Dropping node: ' + str(n))
                    n.drop_tree()
+                    # go on with next loop, this guy is gone
+                    continue

            if n.tag in ['h1', 'h2', 'h3', 'h4']:
                # clean headings
@ -192,15 +201,22 @@ def prep_article(doc):
                if get_class_weight(n) < 0 or get_link_density(n) > .33:
                    # for some reason we get nodes here without a parent
                    if n.getparent() is not None:
+                        LOG.debug(
+                            "Dropping <hX>, it's insignificant: " + str(n))
                        n.drop_tree()
+                        # go on with next loop, this guy is gone
+                        continue
+

            # clean out extra <p>
            if n.tag == 'p':
                # if the p has no children and has no content...well then down
                # with it.
                if not n.getchildren() and len(n.text_content()) < 5:
+                    LOG.debug('Dropping extra <p>: ' + str(n))
                    n.drop_tree()
-
+                    # go on with next loop, this guy is gone
+                    continue
        return node

    def clean_conditionally(doc, clean_el):
@ -222,6 +238,7 @@ def find_candidates(doc):

    for node in doc.iter():
        if is_unlikely_node(node):
+            LOG.debug('Dropping unlikely: ' + str(node))
            node.drop_tree()
        elif node.tag in scorable_node_tags:
            nodes_to_score.append(node)
@ -252,6 +269,10 @@ class Article(object):
        candidates = find_candidates(doc)

        if candidates:
+            LOG.debug('Candidates found:')
+            pp = PrettyPrinter(indent=2)
+            LOG.debug(pp.pformat(candidates))
+
            # right now we return the highest scoring candidate content
            by_score = sorted([c for c in candidates.values()],
                key=attrgetter('content_score'), reverse=True)
@ -259,10 +280,14 @@ class Article(object):
            # since we have several candidates, check the winner's siblings
            # for extra content
            winner = by_score[0]
+            LOG.debug('Selected winning node: ' + str(winner))
            updated_winner = check_siblings(winner, candidates)
+            LOG.debug('Begin final prep of article')
            updated_winner.node = prep_article(updated_winner.node)
            doc = build_base_document(updated_winner.node)
        else:
+            LOG.warning('No candidates found: using document.')
+            LOG.debug('Begin final prep of article')
            doc = prep_article(doc)
            doc = build_base_document(doc)

--- a/src/breadability/scoring.py
+++ b/src/breadability/scoring.py
@ -138,6 +138,10 @@ class ScoredNode(object):
    """
    __slots__ = ['node', 'content_score']

+    def __repr__(self):
+        """Helpful representation of our Scored Node"""
+        return "{0:0.1F}\t{1}".format(self.content_score, self.node)
+
    def __init__(self, node):
        """Given node, set an initial score and weigh based on css and id"""
        self.node = node