Fix tests to pass again

pull/4/merge
Richard Harding 12 years ago
parent 8f28e7c947
commit 60ab4a96b0

@ -69,7 +69,12 @@ def transform_misused_divs_into_paragraphs(doc):
def process(doc):
"""Process this doc to make it readable."""
"""Process this doc to make it readable.
Here we're going to remove unlikely nodes, find scores on the rest, and
clean up and return the final best match.
"""
unlikely = []
scorable_node_tags = ['p', 'td', 'pre']
nodes_to_score = []
@ -92,14 +97,8 @@ def process(doc):
# process our clean up instructions
[n.drop_tree() for n in unlikely]
return doc
# def transform_misused_divs_into_paragraphs(self):
# for elem in self.html.iter():
# if elem.tag.lower() == "div":
# # transform <div>s that do not contain other block elements into <p>s
# if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
# self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
# elem.tag = "p"
class Article(object):
"""Parsed readable object"""
@ -114,6 +113,7 @@ class Article(object):
doc = build_base_document(doc)
doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
doc = transform_misused_divs_into_paragraphs(doc)
doc = process(doc)
return doc

@ -16,10 +16,10 @@
<div id="header">Gone</div>
<!-- These have bad and good terms so should stay -->
<p id="mainfoot">Gone</div>
<p id="harticleeader">Gone</div>
<p class="article header">Gone</div>
<p class="column header">Gone</div>
<div id="mainfoot">Gone</div>
<div id="harticleeader">Gone</div>
<div class="article header">Gone</div>
<div class="column header">Gone</div>
<!-- And this will stick around for final -->
<div>Final content.</div>

Loading…
Cancel
Save