Don't need the old versions any more
parent f4fa0c1040
commit 4cbde9cb5a
@@ -1,508 +0,0 @@
import re

from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from operator import attrgetter
from pprint import PrettyPrinter

from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
from breadability.utils import cached_property


html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                       style=True, links=True, meta=False, add_nofollow=False,
                       page_structure=False, processing_instructions=True,
                       embedded=False, frames=False, forms=False,
                       annoying_tags=False, remove_tags=None,
                       remove_unknown_tags=False, safe_attrs_only=False)


BASE_DOC = """
<html>
    <head>
        <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
    </head>
    <body>
    </body>
</html>
"""
SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article']


def drop_tag(doc, *tags):
    """Helper to remove any nodes matching the given html tags.

    :param *tags: one or more html tag strings to remove e.g. style, script

    """
    for tag in tags:
        found = doc.iterfind(".//" + tag)
        for n in found:
            LNODE.log(n, 1, "Dropping tag")
            n.drop_tree()
    return doc
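

# Added usage sketch (not part of the original file): drop_tag removes whole
# subtrees by tag name, exactly how Article.doc below applies it to
# 'noscript' and 'iframe'.
_example = fromstring(
    '<div><p>keep me</p><iframe src="http://example.com"></iframe></div>')
_example = drop_tag(_example, 'iframe')
assert _example.find('.//iframe') is None  # the iframe subtree is gone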


def is_bad_link(a_node):
    """Helper to determine if the link is something to clean out.

    We've hit articles with lots of links that should be cleaned out because
    they're just there to pollute the space. See tests for examples.

    """
    if a_node.tag == 'a':
        name = a_node.get('name')
        href = a_node.get('href')
        if name and not href:
            return True

        if href:
            url_bits = href.split('#')
            if len(url_bits) == 2:
                if len(url_bits[1]) > 25:
                    return True
    return False
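

# Added illustration: anchors that only define a name, or whose href carries
# a long #fragment, count as pollution; a plain href does not.
assert is_bad_link(fragment_fromstring('<a name="comment-1"></a>'))
assert is_bad_link(
    fragment_fromstring('<a href="/post#' + 'x' * 30 + '">deep link</a>'))
assert not is_bad_link(fragment_fromstring('<a href="/post">a real link</a>'))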


def ok_embedded_video(node):
    """Check if this embed/video is an ok one to count."""
    keep_keywords = ['youtube', 'blip.tv', 'vimeo']
    node_str = tounicode(node)
    for key in keep_keywords:
        if key in node_str:
            return True
    return False


def build_base_document(html, fragment=True):
    """Return a base document with the body as root.

    :param html: Parsed Element object
    :param fragment: Should we return a <div> doc fragment or a full <html>
        doc.

    """
    if html.tag == 'body':
        html.tag = 'div'
        found_body = html
    else:
        found_body = html.find('.//body')

    if found_body is None:
        frag = fragment_fromstring('<div/>')
        frag.set('id', 'readabilityBody')
        frag.append(html)

        if not fragment:
            output = fromstring(BASE_DOC)
            insert_point = output.find('.//body')
            insert_point.append(frag)
        else:
            output = frag
    else:
        found_body.tag = 'div'
        found_body.set('id', 'readabilityBody')

        if not fragment:
            output = fromstring(BASE_DOC)
            insert_point = output.find('.//body')
            insert_point.append(found_body)
        else:
            output = found_body

    output.doctype = "<!DOCTYPE html>"
    return output


def build_error_document(fragment=True):
    """Return an empty error document with the body as root.

    :param fragment: Should we return a <div> doc fragment or a full <html>
        doc.

    """
    frag = fragment_fromstring('<div/>')
    frag.set('id', 'readabilityBody')
    frag.set('class', 'parsing-error')

    if not fragment:
        output = fromstring(BASE_DOC)
        insert_point = output.find('.//body')
        insert_point.append(frag)
    else:
        output = frag

    output.doctype = "<!DOCTYPE html>"
    return output


def transform_misused_divs_into_paragraphs(doc):
    """Turn all divs without block-level element children into p's.

    Since we can't change the tree as we iterate over it, we must do this
    before we process our document.

    The idea is that we process all divs, and if a div does not contain
    another div, then we replace it with a p tag instead, appending its
    contents/children to it.

    """
    for elem in doc.iter(tag='div'):
        child_tags = [n.tag for n in elem.getchildren()]
        if 'div' not in child_tags:
            # If there is no div inside of this div then it's a leaf
            # node in a sense.
            # We need to create a <p> and put all its contents in there.
            # We'll just stringify it, then regex replace the first/last
            # div bits to turn them into <p> vs <div>.
            LNODE.log(elem, 1, 'Turning leaf <div> into <p>')
            orig = tounicode(elem).strip()
            started = re.sub(r'^<\s*div', '<p', orig)
            ended = re.sub(r'div>$', 'p>', started)
            elem.getparent().replace(elem, fromstring(ended))
    return doc
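

# Added sketch: a leaf <div> (no <div> children) is stringified and its outer
# tag swapped for <p>, while the wrapping div survives untouched.
_frag = fragment_fromstring('<div><div>leaf text</div><span>tail</span></div>')
_frag = transform_misused_divs_into_paragraphs(_frag)
assert _frag.find('p') is not None  # the inner div became a paragraph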


def check_siblings(candidate_node, candidate_list):
    """Look through siblings for content that might also be related.

    Things like preambles, content split by ads that we removed, etc.

    """
    candidate_css = candidate_node.node.get('class')
    potential_target = candidate_node.content_score * 0.2
    sibling_target_score = potential_target if potential_target > 10 else 10
    parent = candidate_node.node.getparent()
    siblings = parent.getchildren() if parent is not None else []

    for sibling in siblings:
        append = False
        content_bonus = 0

        if sibling is candidate_node.node:
            LNODE.log(sibling, 1, 'Sibling is the node so append')
            append = True

        # Give a bonus if sibling nodes and the top candidate have the same
        # class name.
        if candidate_css and sibling.get('class') == candidate_css:
            content_bonus += candidate_node.content_score * 0.2

        if sibling in candidate_list:
            adjusted_score = candidate_list[sibling].content_score + \
                content_bonus

            if adjusted_score >= sibling_target_score:
                append = True

        if sibling.tag == 'p':
            link_density = get_link_density(sibling)
            content = sibling.text_content()
            content_length = len(content)

            if content_length > 80 and link_density < 0.25:
                append = True
            elif content_length < 80 and link_density == 0:
                if ". " in content:
                    append = True

        if append:
            LNODE.log(sibling, 1, 'Sibling being appended')
            if sibling.tag not in ['div', 'p']:
                # We have a node that isn't a common block level element,
                # like a form or td tag. Turn it into a div so it doesn't
                # get filtered out later by accident.
                sibling.tag = 'div'

            if candidate_node.node != sibling:
                candidate_node.node.append(sibling)

    return candidate_node


def clean_document(node):
    """Clean up the final document we return as the readable article."""
    if node is None or len(node) == 0:
        return

    LNODE.log(node, 2, "Processing doc")
    clean_list = ['object', 'h1']
    to_drop = []

    # If there is only one h2, they are probably using it as a header and
    # not a subheader, so remove it since we already have a header.
    if len(node.findall('.//h2')) == 1:
        LOG.debug('Adding H2 to list of nodes to clean.')
        clean_list.append('h2')

    for n in node.iter():
        LNODE.log(n, 2, "Cleaning iter node")
        # Clean out any inline style properties.
        if 'style' in n.attrib:
            n.set('style', '')

        # Remove all nodes whose tag is in the clean list.
        # (Unless it's a youtube/vimeo video. People love movies.)
        is_embed = n.tag in ['object', 'embed']
        if n.tag in clean_list:
            allow = False

            # Allow youtube and vimeo videos through as people usually
            # want to see those.
            if is_embed:
                if ok_embedded_video(n):
                    allow = True

            if not allow:
                LNODE.log(n, 2, "Dropping Node")
                to_drop.append(n)

        if n.tag in ['h1', 'h2', 'h3', 'h4']:
            # Clean headings: if the heading has a negative css weight or a
            # high link density, remove it.
            if get_class_weight(n) < 0 or get_link_density(n) > .33:
                LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
                to_drop.append(n)

        # Clean out extra <p>.
        if n.tag == 'p':
            # If the p has no children and has no content...well then down
            # with it.
            if not n.getchildren() and len(n.text_content()) < 5:
                LNODE.log(n, 2, 'Dropping extra <p>')
                to_drop.append(n)

        # Finally try out the conditional cleaning of the target node.
        if clean_conditionally(n):
            to_drop.append(n)

    for n in to_drop:
        if n.getparent() is not None:
            n.drop_tree()
    return node


def clean_conditionally(node):
    """Remove the node if it looks like bad content based on rules."""
    target_tags = ['form', 'table', 'ul', 'div', 'p']

    LNODE.log(node, 2, 'Cleaning conditionally node.')

    if node.tag not in target_tags:
        # This is not the tag you're looking for.
        LNODE.log(node, 2, 'Node cleared.')
        return

    weight = get_class_weight(node)
    # TODO: look up the content score for this node from a previous pass;
    # until then, default to 0.
    content_score = 0

    if weight + content_score < 0:
        LNODE.log(node, 2, 'Dropping conditional node')
        LNODE.log(node, 2, 'Weight + score < 0')
        return True

    if node.text_content().count(',') < 10:
        LOG.debug("There aren't 10 ,s so we're processing more")

        # If there are not very many commas, and the number of
        # non-paragraph elements is more than paragraphs or other ominous
        # signs, remove the element.
        p = len(node.findall('.//p'))
        img = len(node.findall('.//img'))
        li = len(node.findall('.//li')) - 100
        inputs = len(node.findall('.//input'))

        embed = 0
        embeds = node.findall('.//embed')
        for e in embeds:
            if ok_embedded_video(e):
                embed += 1
        link_density = get_link_density(node)
        content_length = len(node.text_content())

        remove_node = False

        if li > p and node.tag != 'ul' and node.tag != 'ol':
            LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
            remove_node = True
        elif inputs > p / 3.0:
            LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
            remove_node = True
        elif content_length < 25 and (img == 0 or img > 2):
            LNODE.log(node, 2,
                      'Conditional drop: len < 25 and 0/>2 images')
            remove_node = True
        elif weight < 25 and link_density > 0.2:
            LNODE.log(node, 2,
                      'Conditional drop: weight small and link is dense')
            remove_node = True
        elif weight >= 25 and link_density > 0.5:
            LNODE.log(node, 2,
                      'Conditional drop: weight big but link heavy')
            remove_node = True
        elif (embed == 1 and content_length < 75) or embed > 1:
            LNODE.log(node, 2,
                      'Conditional drop: embed w/o much content or many embed')
            remove_node = True

        if remove_node:
            LNODE.log(node, 2, 'Node will be removed')
        else:
            LNODE.log(node, 2, 'Node cleared')
        return remove_node

    # Nope, don't remove anything.
    LNODE.log(node, 2, 'Node cleared final.')
    return False
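

# Added sketch of the first rule firing: a class matching the negative-weight
# regex (see scoring.get_class_weight) with no stored content score puts
# weight + score below zero and flags the node immediately.
assert clean_conditionally(
    fragment_fromstring('<div class="sidebar"><p>related</p></div>')) is True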


def prep_article(doc):
    """Once we've found our target article we want to clean it up.

    Clean out:
    - inline styles
    - forms
    - empty <p>
    - extra tags

    """
    doc = clean_document(doc)
    return doc


def find_candidates(doc):
    """Find candidate nodes for the readable version of the article.

    Here we're going to remove unlikely nodes, find scores on the rest, and
    clean up and return the final best match.

    """
    scorable_node_tags = SCORABLE_TAGS
    nodes_to_score = []
    should_remove = []

    for node in doc.iter():
        if is_unlikely_node(node):
            LOG.debug('We should drop unlikely: ' + str(node))
            should_remove.append(node)
            continue
        if node.tag == 'a' and is_bad_link(node):
            LOG.debug('We should drop bad link: ' + str(node))
            should_remove.append(node)
            continue
        if node.tag in scorable_node_tags and node not in nodes_to_score:
            nodes_to_score.append(node)
    return score_candidates(nodes_to_score), should_remove


class Article(object):
    """Parsed readable object"""
    _should_drop = []

    def __init__(self, html, url=None, fragment=True):
        """Create the Article we're going to use.

        :param html: The string of html we're going to parse.
        :param url: The url so we can adjust the links to still work.
        :param fragment: Should we return a <div> fragment or a full <html>
            doc.

        """
        LOG.debug('Url: ' + str(url))
        self.orig = OriginalDocument(html, url=url)
        self.fragment = fragment

    def __str__(self):
        return tostring(self._readable)

    def __unicode__(self):
        return tounicode(self._readable)

    @cached_property(ttl=600)
    def doc(self):
        """The doc is the parsed xml tree of the given html."""
        try:
            doc = self.orig.html
            # Cleaning doesn't return a new doc; it wipes in place.
            html_cleaner(doc)
            doc = drop_tag(doc, 'noscript', 'iframe')
            doc = transform_misused_divs_into_paragraphs(doc)
            return doc
        except ValueError:
            return None

    @cached_property(ttl=600)
    def candidates(self):
        """Generate the list of candidates from the doc."""
        doc = self.doc
        if doc is not None and len(doc):
            candidates, should_drop = find_candidates(doc)
            self._should_drop = should_drop
            return candidates
        else:
            return None

    @cached_property(ttl=600)
    def readable(self):
        return tounicode(self._readable)

    @cached_property(ttl=600)
    def _readable(self):
        """The readable parsed article"""
        if self.candidates:
            LOG.debug('Candidates found:')
            pp = PrettyPrinter(indent=2)

            # Clean up by removing the should_drop nodes we spotted.
            for n in self._should_drop:
                if n.getparent() is not None:
                    n.drop_tree()

            # Right now we return the highest scoring candidate content.
            by_score = sorted([c for c in self.candidates.values()],
                              key=attrgetter('content_score'), reverse=True)
            LOG.debug(pp.pformat(by_score))

            # Since we have several candidates, check the winner's siblings
            # for extra content.
            winner = by_score[0]
            LOG.debug('Selected winning node: ' + str(winner))
            updated_winner = check_siblings(winner, self.candidates)
            LOG.debug('Begin final prep of article')
            updated_winner.node = prep_article(updated_winner.node)
            if updated_winner.node is not None:
                doc = build_base_document(updated_winner.node, self.fragment)
            else:
                LOG.warning(
                    'Had candidates but failed to find a cleaned winning doc.')
                doc = self._handle_no_candidates()
        else:
            LOG.warning('No candidates found: using document.')
            LOG.debug('Begin final prep of article')
            doc = self._handle_no_candidates()

        return doc

    def _handle_no_candidates(self):
        """If we fail to find a good candidate we need something else."""
        # Since we've not found a good candidate, fall back to the whole
        # document.
        if self.doc is not None and len(self.doc):
            # Clean up by removing the should_drop nodes we spotted.
            for n in self._should_drop:
                if n.getparent() is not None:
                    n.drop_tree()
            doc = prep_article(self.doc)
            doc = build_base_document(doc, self.fragment)
        else:
            LOG.warning('No document to use.')
            doc = build_error_document(self.fragment)

        return doc
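

# A hedged end-to-end sketch, added for illustration (the package's real
# checks live in its test suite): Article is the public entry point and
# .readable returns the extracted markup as unicode.
if __name__ == '__main__':
    sample = ('<html><body><div><p>' +
              'Readable content, with commas and length. ' * 20 +
              '</p></div></body></html>')
    article = Article(sample, url='http://example.com/post', fragment=True)
    print(article.readable)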
@@ -1,237 +0,0 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re
from hashlib import md5
from lxml.etree import tounicode

from breadability.logconfig import LNODE
from breadability.logconfig import LOG

# A series of regexes of attribute values we check to help determine if a
# node is a potential candidate or not.
CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
    'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
    'pager|perma|popup|tweet|twitter'), re.I)
CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
    'page|pagination|post|text|blog|story'), re.I)
CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
    'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
    'sidebar|sponsor|shopping|tags|tool|widget'), re.I)


def check_node_attr(node, attr, checkset):
    value = node.get(attr) or ""
    return bool(checkset.search(value))


def generate_hash_id(node):
    """Generate a hash_id for the node in question.

    :param node: lxml etree node

    """
    content = tounicode(node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception as e:
        LOG.error("BOOM! " + str(e))

    return hashed.hexdigest()[0:8]


def get_link_density(node, node_text=None):
    """Generate a value for the number of links in the node.

    :param node: parsed ElementTree node
    :param node_text: if we already have the text_content() make this easier
        on us.
    :returns float:

    """
    link_length = sum([len(a.text_content()) or 0
                       for a in node.findall(".//a")])
    # For each img, give 50 bonus chars worth of length.
    # Tweaking this 50 down a notch should help if we hit false positives.
    link_length = max(link_length -
                      sum([50 for img in node.findall(".//img")]), 0)
    if node_text:
        text_length = len(node_text)
    else:
        text_length = len(node.text_content())
    return float(link_length) / max(text_length, 1)
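

# Added worked example: 11 chars of anchor text over 44 chars of total text
# gives a density of 0.25; each <img> found would knock 50 chars off the
# link total before dividing.
from lxml.html import fragment_fromstring

assert get_link_density(
    fragment_fromstring(
        '<div>' + 'a' * 33 + '<a href="#">' + 'b' * 11 + '</a></div>')) == 0.25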


def get_class_weight(node):
    """Get an element's class/id weight.

    We're using regexes to help efficiently check for matches.

    """
    weight = 0
    if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):
        weight = weight - 25
    if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):
        weight = weight + 25

    if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):
        weight = weight - 25
    if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):
        weight = weight + 25

    return weight
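

# Added illustration: class and id are weighed independently, +/-25 each.
from lxml.html import fragment_fromstring

assert get_class_weight(fragment_fromstring('<div class="article"/>')) == 25
assert get_class_weight(
    fragment_fromstring('<div class="article" id="footer"/>')) == 0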


def is_unlikely_node(node):
    """Short helper for checking unlikely status.

    If the class or id are in the unlikely list, and there's not also a
    class/id in the likely list, then it might need to be removed.

    """
    unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \
        check_node_attr(node, 'id', CLS_UNLIKELY)

    maybe = check_node_attr(node, 'class', CLS_MAYBE) or \
        check_node_attr(node, 'id', CLS_MAYBE)

    return unlikely and not maybe and node.tag != 'body'
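

# Added illustration: an unlikely class flags the node unless a likely
# ("maybe") match rescues it.
from lxml.html import fragment_fromstring

assert is_unlikely_node(fragment_fromstring('<div class="comment"/>'))
assert not is_unlikely_node(
    fragment_fromstring('<div class="comment article"/>'))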


def score_candidates(nodes):
    """Given a list of potential nodes, find some initial scores to start."""
    MIN_HIT_LENGTH = 25
    candidates = {}

    for node in nodes:
        LNODE.log(node, 1, "Scoring Node")

        content_score = 0
        # If the node has no parent it knows of, then it ends up creating a
        # body and html tag to parent the html fragment.
        parent = node.getparent()
        grand = parent.getparent() if parent is not None else None
        innertext = node.text_content()

        if parent is None or grand is None:
            LNODE.log(
                node, 1,
                "Skipping candidate because parent/grand are none")
            continue

        # If this paragraph is less than 25 characters, don't even count it.
        if innertext and len(innertext) < MIN_HIT_LENGTH:
            LNODE.log(
                node, 1,
                "Skipping candidate because not enough content.")
            continue

        # Initialize readability data for the parent.
        # If the parent node isn't in the candidate list, add it.
        if parent not in candidates:
            candidates[parent] = ScoredNode(parent)

        if grand not in candidates:
            candidates[grand] = ScoredNode(grand)

        # Add a point for the paragraph itself as a base.
        content_score += 1

        if innertext:
            # Add 0.25 points for any commas within this paragraph.
            content_score += innertext.count(',') * 0.25
            LNODE.log(node, 1,
                      "Bonus points for ,: " + str(innertext.count(',')))

            # Subtract 0.5 points for each double quote within this
            # paragraph.
            content_score += innertext.count('"') * (-0.5)
            LNODE.log(node, 1,
                      'Penalty points for ": ' + str(innertext.count('"')))

            # For every 100 characters in this paragraph, add another point.
            # Up to 3 points.
            length_points = len(innertext) / 100

            if length_points > 3:
                content_score += 3
            else:
                content_score += length_points
            LNODE.log(
                node, 1,
                "Length/content points: {0} : {1}".format(length_points,
                                                          content_score))

        # Add the score to the parent.
        LNODE.log(node, 1, "From this current node.")
        candidates[parent].content_score += content_score
        LNODE.log(
            candidates[parent].node,
            1,
            "Giving parent bonus points: " + str(
                candidates[parent].content_score))
        # The grandparent gets half.
        LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
        candidates[grand].content_score += (content_score / 2.0)
        LNODE.log(
            candidates[grand].node,
            1,
            "Giving grand bonus points: " + str(
                candidates[grand].content_score))

    for candidate in candidates.values():
        adjustment = 1 - get_link_density(candidate.node)
        LNODE.log(
            candidate.node,
            1,
            "Getting link density adjustment: {0} * {1} ".format(
                candidate.content_score, adjustment))
        candidate.content_score = candidate.content_score * adjustment

    return candidates
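

# Added worked sketch of the arithmetic above (kept as an uncalled helper so
# nothing runs at import time): with this file's Python 2 division, a
# 150-char paragraph holding 25 commas earns 1 (base) + 25 * 0.25 (commas)
# + 150 / 100 -> 1 (length, capped at 3) = 8.25. Its parent <div> seeds at
# +5 (see ScoredNode below), ending at 13.25 with no links to adjust for;
# the grandparent gets half the paragraph bonus.
def _score_candidates_example():
    from lxml.html import fragment_fromstring
    grand = fragment_fromstring(
        '<div><div><p>' + 'word, ' * 25 + '</p></div></div>')
    scores = score_candidates([grand[0][0]])
    assert scores[grand[0]].content_score == 13.25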


class ScoredNode(object):
    """Scored nodes we use to track possible article matches.

    We might have a bunch of these so we use __slots__ to keep memory usage
    down.

    """
    __slots__ = ['node', 'content_score']

    def __repr__(self):
        """Helpful representation of our Scored Node"""
        return "{0}: {1:0.1F}\t{2}".format(
            self.hash_id,
            self.content_score,
            self.node)

    def __init__(self, node):
        """Given a node, set an initial score and weight based on css/id."""
        self.node = node
        content_score = 0
        if node.tag in ['div', 'article']:
            content_score = 5

        if node.tag in ['pre', 'td', 'blockquote']:
            content_score = 3

        if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
                        'form']:
            content_score = -3
        if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
            content_score = -5

        content_score += get_class_weight(node)
        self.content_score = content_score

    @property
    def hash_id(self):
        return generate_hash_id(self.node)
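

# Added usage sketch: the initial score comes purely from tag type plus
# class/id weight.
from lxml.html import fragment_fromstring

_sn = ScoredNode(fragment_fromstring('<blockquote class="post"/>'))
assert _sn.content_score == 28  # +3 for blockquote, +25 for class "post"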