Some refactoring starts to help us organize tests/code

pull/4/merge
Richard Harding 12 years ago
parent ccac04e567
commit ab79d9632b

@@ -6,25 +6,12 @@ from lxml.html.clean import Cleaner
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from breadability.document import OriginalDocument
from breadability.scoring import score_candidates
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
from breadability.utils import cached_property
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = set([
'combx', 'comment', 'community', 'disqus', 'extra', 'foot', 'header',
'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter',
])
CLS_MAYBE = set([
'and', 'article', 'body', 'column', 'main', 'shadow',
])
CLS_WEIGHT_POSITIVE = set(['article', 'body', 'content', 'entry', 'hentry',
'main', 'page', 'pagination', 'post', 'text', 'blog', 'story'])
CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
'footer', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo',
'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
'tool', 'widget'])
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True,
@@ -33,15 +20,6 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
remove_unknown_tags=False, safe_attrs_only=False)
def check_node_attr(node, attr, checkset):
attr = node.get(attr) or ""
check = set(attr.lower().split(' '))
if check.intersection(checkset):
return True
else:
return False
def drop_tag(doc, *tags):
"""Helper to just remove any nodes that match this html tag passed in
@@ -79,18 +57,6 @@ def build_base_document(html):
return found_body
def get_link_density(node):
"""Generate a value for the number of links in the node.
:param node: parsed ElementTree node
:returns float:
"""
link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@@ -172,79 +138,6 @@ def check_siblings(candidate_node, candidate_list):
return candidate_node
###### SCORING
def get_class_weight(node):
"""Get an elements class/id weight.
We're using sets to help efficiently check for existence of matches.
"""
weight = 0
if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):
weight = weight - 25
if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):
weight = weight + 25
if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):
weight = weight - 25
if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):
weight = weight + 25
return weight
def score_candidates(nodes):
"""Given a list of potential nodes, find some initial scores to start"""
MIN_HIT_LENGTH = 25
candidates = {}
for node in nodes:
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text
if parent is None or grand is None:
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENGTH:
continue
# Initialize readability data for the parent.
# if the parent node isn't in the candidate list, add it
if parent not in candidates:
candidates[parent] = CandidateNode(parent)
if grand not in candidates:
candidates[grand] = CandidateNode(grand)
# Add a point for the paragraph itself as a base.
content_score += 1
# Add points for any commas within this paragraph
content_score += innertext.count(',') if innertext else 0
# For every 100 characters in this paragraph, add another point. Up to
# 3 points.
length_points = len(innertext) // 100 if innertext else 0
content_score += length_points if length_points < 3 else 3
# Add the score to the parent. The grandparent gets half.
if parent is not None:
    candidates[parent].content_score += content_score
if grand is not None:
    candidates[grand].content_score += content_score / 2.0
for candidate in candidates.values():
candidate.content_score = candidate.content_score * (1 -
get_link_density(candidate.node))
return candidates
def prep_article(doc):
"""Once we've found our target article we want to clean it up.
@@ -256,17 +149,17 @@ def prep_article(doc):
"""
def clean_document(candidate):
def clean_document(node):
"""Remove the style attribute on every element."""
clean_list = ['object', 'h1']
keep_keywords = ['youtube', 'blip.tv', 'vimeo']
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
if len(candidate.node.findall('.//h2')) == 1:
if len(node.findall('.//h2')) == 1:
clean_list.append('h2')
for n in candidate.node.getiterator():
for n in node.getiterator():
# clean out any inline style properties
n.set('style', '')
@@ -304,7 +197,7 @@ def prep_article(doc):
if not n.getchildren() and len(n.text_content()) < 5:
n.drop_tree()
return candidate
return node
def clean_conditionally(doc, clean_el):
"""Remove the clean_el if it looks like bad content based on rules."""
@@ -327,24 +220,6 @@ def process(doc):
scorable_node_tags = ['p', 'td', 'pre']
nodes_to_score = []
def is_unlikely_node(node):
"""Short helper for checking unlikely status.
If the class or id is in the unlikely list, and there's not also a
class/id in the likely list, then it might need to be removed.
"""
unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \
check_node_attr(node, 'id', CLS_UNLIKELY)
maybe = check_node_attr(node, 'class', CLS_MAYBE) or \
check_node_attr(node, 'id', CLS_MAYBE)
if unlikely and not maybe and node.tag != 'body':
return True
else:
return False
for node in doc.getiterator():
if is_unlikely_node(node):
unlikely.append(node)
@@ -359,34 +234,6 @@ def process(doc):
return candidates
class CandidateNode(object):
"""We need Candidate nodes we use to track possible article matches
We might have a bunch of these so we use __slots__ to keep memory usage
down.
"""
__slots__ = ['node', 'content_score']
def __init__(self, node):
"""Given node, set an initial score and weigh based on css and id"""
self.node = node
content_score = 0
if node.tag == 'div':
content_score = 5
if node.tag in ['pre', 'td', 'blockquote']:
content_score = 3
if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
'form']:
content_score = -3
if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
content_score = -5
content_score += get_class_weight(node)
self.content_score = content_score
class Article(object):
"""Parsed readable object"""
@@ -418,311 +265,10 @@ class Article(object):
# for extra content
winner = by_score[0]
updated_winner = check_siblings(winner, candidates)
doc = prep_article(updated_winner)
updated_winner.node = prep_article(updated_winner.node)
doc = build_base_document(updated_winner.node)
else:
doc = prep_article(doc)
doc = build_base_document(doc)
return doc
"""
Algorithm notes for readability.js's grabArticle():
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
grabArticle: function (page) {
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
isPaging = (page !== null) ? true: false;
page = page ? page : document.body;
var pageCacheHtml = page.innerHTML;
var allElements = page.getElementsByTagName('*');
/**
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
*
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
* TODO: Shouldn't this be a reverse traversal?
**/
var node = null;
var nodesToScore = [];
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
/* Remove unlikely candidates */
if (stripUnlikelyCandidates) {
var unlikelyMatchString = node.className + node.id;
if (
(
unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
node.tagName !== "BODY"
)
)
{
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
nodeIndex-=1;
continue;
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
nodesToScore[nodesToScore.length] = node;
}
/* Turn all divs that don't have children block level elements into p's */
if (node.tagName === "DIV") {
if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
var newNode = document.createElement('p');
try {
newNode.innerHTML = node.innerHTML;
node.parentNode.replaceChild(newNode, node);
nodeIndex-=1;
nodesToScore[nodesToScore.length] = node;
}
catch(e) {
dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
}
}
else
{
/* EXPERIMENTAL */
for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
var childNode = node.childNodes[i];
if(childNode.nodeType === 3) { // Node.TEXT_NODE
var p = document.createElement('p');
p.innerHTML = childNode.nodeValue;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
}
}
}
}
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
var parentNode = nodesToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = readability.getInnerText(nodesToScore[pt]);
if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
continue;
}
/* If this paragraph is less than 25 characters, don't even count it. */
if(innerText.length < 25) {
continue; }
/* Initialize readability data for the parent. */
if(typeof parentNode.readability === 'undefined') {
readability.initializeNode(parentNode);
candidates.push(parentNode);
}
/* Initialize readability data for the grandparent. */
if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
readability.initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore+=1;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
grandParentNode.readability.contentScore += contentScore/2;
}
}
/**
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
**/
var topCandidate = null;
for(var c=0, cl=candidates.length; c < cl; c+=1)
{
/**
* Scale the final candidates score based on link density. Good content should have a
* relatively small link density (5% or less) and be mostly unaffected by this operation.
**/
candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
topCandidate = candidates[c]; }
}
/**
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
**/
if (topCandidate === null || topCandidate.tagName === "BODY")
{
topCandidate = document.createElement("DIV");
topCandidate.innerHTML = page.innerHTML;
page.innerHTML = "";
page.appendChild(topCandidate);
readability.initializeNode(topCandidate);
}
/**
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
**/
var articleContent = document.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
var siblingNode = siblingNodes[s];
var append = false;
/**
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
* Example of error visible here: http://www.esquire.com/features/honesty0707
**/
if(!siblingNode) {
continue;
}
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if(siblingNode === topCandidate)
{
append = true;
}
var contentBonus = 0;
/* Give a bonus if sibling nodes and top candidates have the exact same classname */
if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
{
append = true;
}
if(siblingNode.nodeName === "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
}
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
{
append = true;
}
}
if(append) {
dbg("Appending node: " + siblingNode);
var nodeToAppend = null;
if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
nodeToAppend.id = siblingNode.id;
nodeToAppend.innerHTML = siblingNode.innerHTML;
}
catch(er) {
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
} else {
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(nodeToAppend);
}
}
/**
* So we have all of the content that we need. Now we clean it up for presentation.
**/
readability.prepArticle(articleContent);
if (readability.curPageNum === 1) {
articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
}
/**
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* finding the -right- content.
**/
if(readability.getInnerText(articleContent, false).length < 250) {
page.innerHTML = pageCacheHtml;
if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
return readability.grabArticle(page);
} else {
return null;
}
}
return articleContent;
},
/**
"""

@@ -0,0 +1,157 @@
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = set([
'combx', 'comment', 'community', 'disqus', 'extra', 'foot', 'header',
'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter',
])
CLS_MAYBE = set([
'and', 'article', 'body', 'column', 'main', 'shadow',
])
CLS_WEIGHT_POSITIVE = set(['article', 'body', 'content', 'entry', 'hentry',
'main', 'page', 'pagination', 'post', 'text', 'blog', 'story'])
CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
'footer', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo',
'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
'tool', 'widget'])
def check_node_attr(node, attr, checkset):
attr = node.get(attr) or ""
check = set(attr.lower().split(' '))
if check.intersection(checkset):
return True
else:
return False
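As a quick illustration of the matching, check_node_attr splits the attribute on spaces and looks for any overlap with the given set (a usage sketch that assumes this new module is breadability/scoring.py; the markup is made up):

from lxml.html import fragment_fromstring
from breadability.scoring import check_node_attr, CLS_UNLIKELY

node = fragment_fromstring('<div class="share-tools sidebar">x</div>')
check_node_attr(node, 'class', CLS_UNLIKELY)  # True, 'sidebar' is in CLS_UNLIKELY
check_node_attr(node, 'id', CLS_UNLIKELY)     # False, no id attribute at all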
def get_link_density(node):
"""Generate a value for the number of links in the node.
:param node: parsed ElementTree node
:returns float:
"""
link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
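A small worked case with made-up markup: the anchor text accounts for 9 of the 24 text characters, so the density comes out to 0.375.

from lxml.html import fragment_fromstring
from breadability.scoring import get_link_density

node = fragment_fromstring(
    '<div><p>Some text here.</p><a href="#">read more</a></div>')
get_link_density(node)  # 9 / 24.0 == 0.375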
def get_class_weight(node):
"""Get an elements class/id weight.
We're using sets to help efficiently check for existence of matches.
"""
weight = 0
if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):
weight = weight - 25
if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):
weight = weight + 25
if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):
weight = weight - 25
if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):
weight = weight + 25
return weight
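Positive and negative hits are worth 25 points each, and class and id are checked independently, so they can cancel out (illustration only, made-up markup):

from lxml.html import fragment_fromstring
from breadability.scoring import get_class_weight

get_class_weight(fragment_fromstring('<div class="post entry">x</div>'))           # +25
get_class_weight(fragment_fromstring('<div class="content" id="footer">x</div>'))  # +25 - 25 == 0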
def is_unlikely_node(node):
"""Short helper for checking unlikely status.
If the class or id is in the unlikely list, and there's not also a
class/id in the likely list, then it might need to be removed.
"""
unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \
check_node_attr(node, 'id', CLS_UNLIKELY)
maybe = check_node_attr(node, 'class', CLS_MAYBE) or \
check_node_attr(node, 'id', CLS_MAYBE)
if unlikely and not maybe and node.tag != 'body':
return True
else:
return False
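So a node is only flagged when it matches the unlikely set without also matching the maybe set (and isn't the body element), for example:

from lxml.html import fragment_fromstring
from breadability.scoring import is_unlikely_node

is_unlikely_node(fragment_fromstring('<div class="comment disqus">x</div>'))   # True
is_unlikely_node(fragment_fromstring('<div class="comment article">x</div>'))  # False, 'article' is a maybe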
def score_candidates(nodes):
"""Given a list of potential nodes, find some initial scores to start"""
MIN_HIT_LENGTH = 25
candidates = {}
for node in nodes:
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text
if parent is None or grand is None:
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENGTH:
continue
# Initialize readability data for the parent.
# if the parent node isn't in the candidate list, add it
if parent not in candidates:
candidates[parent] = ScoredNode(parent)
if grand not in candidates:
candidates[grand] = ScoredNode(grand)
# Add a point for the paragraph itself as a base.
content_score += 1
# Add points for any commas within this paragraph
content_score += innertext.count(',') if innertext else 0
# For every 100 characters in this paragraph, add another point. Up to
# 3 points.
length_points = len(innertext) // 100 if innertext else 0
content_score += length_points if length_points < 3 else 3
# Add the score to the parent. The grandparent gets half.
if parent is not None:
    candidates[parent].content_score += content_score
if grand is not None:
    candidates[grand].content_score += content_score / 2.0
for candidate in candidates.values():
candidate.content_score = candidate.content_score * (1 -
get_link_density(candidate.node))
return candidates
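A rough usage sketch with made-up HTML (exact scores also depend on the tag and class/id weights handled by ScoredNode and get_class_weight): the paragraphs themselves aren't returned, their parent and grandparent divs are.

from lxml.html import fromstring
from breadability.scoring import score_candidates

doc = fromstring(
    '<html><body><div id="page"><div id="story">'
    '<p>A first paragraph that is comfortably longer than the minimum, '
    'with a comma or two, so it earns a few points.</p>'
    '<p>A second paragraph of similar length keeps feeding score into '
    'the shared parent div and its grandparent.</p>'
    '</div></div></body></html>')
candidates = score_candidates(doc.findall('.//p'))
for scored in candidates.values():
    print(scored.node.get('id'), scored.content_score)  # 'story' and 'page'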
class ScoredNode(object):
"""We need Scored nodes we use to track possible article matches
We might have a bunch of these so we use __slots__ to keep memory usage
down.
"""
__slots__ = ['node', 'content_score']
def __init__(self, node):
"""Given node, set an initial score and weigh based on css and id"""
self.node = node
content_score = 0
if node.tag == 'div':
content_score = 5
if node.tag in ['pre', 'td', 'blockquote']:
content_score = 3
if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
'form']:
content_score = -3
if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
content_score = -5
content_score += get_class_weight(node)
self.content_score = content_score
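The tag picks the starting score and get_class_weight then shifts it, so, as an illustration with made-up markup:

from lxml.html import fragment_fromstring
from breadability.scoring import ScoredNode

ScoredNode(fragment_fromstring('<div class="content">x</div>')).content_score      # 5 + 25 == 30
ScoredNode(fragment_fromstring('<ul class="sidebar"><li>x</li></ul>')).content_score  # -3 - 25 == -28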

@@ -4,11 +4,11 @@ from lxml.html import fragment_fromstring
from unittest import TestCase
from breadability.readable import Article
from breadability.readable import CandidateNode
from breadability.readable import get_class_weight
from breadability.readable import get_link_density
from breadability.readable import score_candidates
from breadability.readable import transform_misused_divs_into_paragraphs
from breadability.scoring import ScoredNode
from breadability.tests import load_snippet
from breadability.tests import load_article
@@ -130,19 +130,19 @@ class TestCandidateNodes(TestCase):
for n in fives:
doc = fragment_fromstring(n)
self.assertEqual(CandidateNode(doc).content_score, 5)
self.assertEqual(ScoredNode(doc).content_score, 5)
for n in threes:
doc = fragment_fromstring(n)
self.assertEqual(CandidateNode(doc).content_score, 3)
self.assertEqual(ScoredNode(doc).content_score, 3)
for n in neg_threes:
doc = fragment_fromstring(n)
self.assertEqual(CandidateNode(doc).content_score, -3)
self.assertEqual(ScoredNode(doc).content_score, -3)
for n in neg_fives:
doc = fragment_fromstring(n)
self.assertEqual(CandidateNode(doc).content_score, -5)
self.assertEqual(ScoredNode(doc).content_score, -5)
class TestClassWeights(TestCase):
