Merge pull request #347 from evanxd/bug-1259763

Bug 1259763 - Remove h2 when there is only one h2 and its text content substantially equals article title, r=Gijs
pull/298/merge
Evan Tseng 7 years ago committed by GitHub
commit 498a7b2bf6

@ -32,6 +32,7 @@ function Readability(uri, doc, options) {
this._uri = uri;
this._doc = doc;
this._biggestFrame = false;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
@ -482,10 +483,18 @@ Readability.prototype = {
this._cleanMatchedNodes(topCandidate, /share/);
});
// If there is only one h2, they are probably using it as a header
// and not a subheader, so remove it since we already have a header.
if (articleContent.getElementsByTagName('h2').length === 1)
this._clean(articleContent, "h2");
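// If there is only one h2 and its text content substantially equals the article title
// (their lengths differ by less than half of the title's length, and the longer text contains the shorter one),
// they are probably using it as a header and not a subheader,
// so remove it since we already extract the title separately.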
var h2 = articleContent.getElementsByTagName('h2');
if (h2.length === 1) {
var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
if (Math.abs(lengthSimilarRate) < 0.5 &&
(lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
this._articleTitle.includes(h2[0].textContent))) {
this._clean(articleContent, "h2");
}
}
this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
@ -1920,7 +1929,7 @@ Readability.prototype = {
this._prepDocument();
var metadata = this._getArticleMetadata();
var articleTitle = metadata.title;
this._articleTitle = metadata.title;
var articleContent = this._grabArticle();
if (!articleContent)
@ -1951,7 +1960,7 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
uri: this._uri,
title: articleTitle,
title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,

@ -16,6 +16,7 @@
<p><img src="http://fakehost/foo/bar/baz.png"/></p>
<p><img src="http://test/foo/bar/baz.png"/></p>
<p><img src="https://test/foo/bar/baz.png"/></p>
<h2>Foo</h2>
<p> Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
</article>
</div>

@ -0,0 +1,7 @@
{
"title": "The 'birth lottery' and economic mobility",
"byline": "Ahiza Garcia",
"dir": null,
"excerpt": "A recently-released report on poverty and inequality found that the U.S. ranks the lowest among countries with welfare states.",
"readerable": true
}

@ -0,0 +1,56 @@
<div id="readability-page-1" class="page">
<div id="storytext">
<div id="js-ie-storytop" class="ie--storytop">
<div class="cnnplayer fade-in" id="cnnplayer_cvp_story_0">
<div class="cnnVidplayer">
<div class="summaryImg" id="vid0" href="/video/news/2015/11/30/homeboy-industries-priest.cnnmoney" onclick="javascript:VideoPlayerManager.playVideos('cvp_story_0'); return false;"><video id="cvp_story_0" preload="metadata" poster="" src="http://ht3.cdn.turner.com/money/big/news/2015/11/30/homeboy-industries-priest.cnnmoney_1024x576.mp4" controls="controls" width="300" height="169"></video>
<div id="cvp_story_0_endSlate" class="video-posterboard end-slate">
<div class="video-slate-wrapper">
<div class="video-bg">
<img src="" alt="" width="620" height="348" /></div>
</div>
</div>
</div>
</div>
</div>
</div>
<h2>The U.S. has long been heralded as a land of opportunity -- a place where anyone can succeed regardless of the economic class they were born into.</h2>
<p> But a new report released on Monday by <a href="http://web.stanford.edu/group/scspi-dev/cgi-bin/" target="_blank">Stanford University's Center on Poverty and Inequality</a> calls that into question. </p>
<p> The report assessed poverty levels, income and wealth inequality, economic mobility and unemployment levels among 10 wealthy countries with social welfare programs. </p>
<div id="smartassetcontainer" class="module">
<div class="module">
<div class="module-body">
<div id="smartasset-article" class="collapsible">
<div>
<p class="cnnhdr">
Powered by SmartAsset.com
</p>
<img src="https://smrt.as/ck" />
</div>
</div>
</div>
</div>
</div>
<p> Among its key findings: the class you're born into matters much more in the U.S. than many of the other countries. </p>
<p> As the <a href="http://web.stanford.edu/group/scspi-dev/cgi-bin/publications/state-union-report" target="_blank">report states</a>: "[T]he birth lottery matters more in the U.S. than in most well-off countries." </p>
<p> But this wasn't the only finding that suggests the U.S. isn't quite living up to its reputation as a country where everyone has an equal chance to get ahead through sheer will and hard work. </p>
<p> <a href="http://money.cnn.com/2016/01/11/news/economy/rich-taxes/index.html?iid=EL"><span class="inStoryHeading">Related: Rich are paying more in taxes but not as much as they used to</span></a> </p>
<p> The report also suggested the U.S. might not be the "jobs machine" it thinks it is, when compared to other countries. </p>
<p> It ranked near the bottom of the pack based on the levels of unemployment among men and women of prime working age. The study determined this by taking the ratio of employed men and women between the ages of 25 and 54 compared to the total population of each country. </p>
<p> The overall rankings of the countries were as follows:<span> <br/>1. Finland <span> <br/>2. Norway<span> <br/>3. Australia <span> <br/>4. Canada<span> <br/>5. Germany<span> <br/>6. France<span> <br/>7. United Kingdom <span> <br/>8. Italy<span> <br/>9. Spain<span> <br/>10. United States </span></span>
</span>
</span>
</span>
</span>
</span>
</span>
</span>
</span>
</p>
<p> The low ranking the U.S. received was due to its extreme levels of wealth and income inequality and the ineffectiveness of its "safety net" -- social programs aimed at reducing poverty. </p>
<p> <a href="http://money.cnn.com/2016/01/05/news/economy/chicago-segregated/index.html?iid=EL"><span class="inStoryHeading">Related: Chicago is America's most segregated city</span></a> </p>
<p> The report concluded that the American safety net was ineffective because it provides only half the financial help people need. Additionally, the levels of assistance in the U.S. are generally lower than in other countries. </p>
<p class="storytimestamp"> <span class="cnnStorySource"> CNNMoney (New York) </span> <span class="cnnDateStamp">First published February 1, 2016: 1:28 AM ET</span> </p>
</div>
</div>

File diff suppressed because it is too large Load Diff

@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<div class="section-inner layoutSingleColumn">
<h2 name="3c62" id="3c62" data-align="center" class="graf--h2">Open Journalism Project:</h2>
<h4 name="425a" id="425a" data-align="center" class="graf--h4"><em class="markup--em markup--h4-em">Better Student Journalism</em></h4>
<h4 name="08db" id="08db" class="graf--h4 graf--empty"><br/></h4>
<p name="d178" id="d178" class="graf--p">We pushed out the first version of the <a href="http://pippinlee.github.io/open-journalism-project/" data-href="http://pippinlee.github.io/open-journalism-project/" class="markup--anchor markup--p-anchor" rel="nofollow">Open Journalism site</a> in January. Our goal is for the site to be a place to teach students what they should know about journalism on the web. It should be fun too.</p>

@ -1,6 +1,7 @@
<div id="readability-page-1" class="page">
<article>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tab here incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<h2>Foo</h2>
<p> Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
</article>
</div>

@ -1,6 +1,7 @@
<div id="readability-page-1" class="page">
<article>
<p> <span face="Arial" size="2">Lorem ipsum dolor</span> sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. <span face="Arial" size="2">Duis</span> aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<h2>Foo</h2>
<p> Tempor incididunt ut labore et <span face="Arial" size="2">dolore</span> magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. <span face="Arial" size="2">Excepteur sint occaecat</span> cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
</article>
</div>

@ -1,6 +1,7 @@
<div id="readability-page-1" class="page">
<article>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<h2>Foo</h2>
<p> Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
</article>
</div>

@ -1,6 +1,7 @@
<div id="readability-page-1" class="page">
<div class="post single-post" id="post-2015_02_26_lupita-nyongo-pearl-dress-stolen-oscars">
<p class="headline">
<h2 class="hf1">Lupita Nyong'o</h2>
<h4 class="hf2">$150K Pearl Oscar Dress ... STOLEN!!!!</h4> </p>
<h5 class="article-posted-date">
2/26/2015 7:11 AM PST BY TMZ STAFF

Loading…
Cancel
Save