Prefer JSON-LD metadata object, when present (#609)

* Prefer JSON-LD metadata object, when present

* Log JSON-LD parsing error

* Trim all JSON-LD fields
pull/618/head
Dan Burzo 4 years ago committed by GitHub
parent 914307a90b
commit 2ca98284e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -53,6 +53,7 @@ function Readability(doc, options) {
this._serializer = options.serializer || function(el) {
return el.innerHTML;
};
this._disableJSONLD = !!options.disableJSONLD;
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
@ -135,7 +136,9 @@ Readability.prototype = {
whitespace: /^\s*$/,
hasContent: /\S$/,
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
// See: https://schema.org/Article
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
},
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
@ -244,6 +247,21 @@ Readability.prototype = {
Array.prototype.forEach.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, and return the first node that passes
* the supplied test function.
*
* For convenience, the current object context is applied to the provided
* test function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The test function.
* @return Node The first node for which the test function returns a truthy
*         value, or undefined when no node passes the test.
*/
_findNode: function(nodeList, fn) {
// Array.prototype.find is invoked through call() so it can walk the
// array-like nodeList; passing `this` as the thisArg is what applies the
// current object context to fn.
return Array.prototype.find.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, return true if any of the provided iterate
* function calls returns true, false otherwise.
@ -1292,12 +1310,80 @@ Readability.prototype = {
});
},
/**
* Try to extract metadata from JSON-LD object.
* For now, only Schema.org objects of type Article or its subtypes are supported.
* @return Object with any metadata that could be extracted (possibly none)
*/
_getJSONLD: function (doc) {
var scripts = this._getAllNodesWithTag(doc, ["script"]);
var jsonLdElement = this._findNode(scripts, function(el) {
return el.getAttribute("type") === "application/ld+json";
});
if (jsonLdElement) {
try {
// Strip CDATA markers if present
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
var parsed = JSON.parse(content);
var metadata = {};
if (
!parsed["@context"] ||
!parsed["@context"].match(/^https?\:\/\/schema\.org$/)
) {
return metadata;
}
if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
parsed = parsed["@graph"].find(function(it) {
return (it["@type"] || "").match(
this.REGEXPS.jsonLdArticleTypes
);
});
}
if (
!parsed ||
!parsed["@type"] ||
!parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
) {
return metadata;
}
if (typeof parsed.name === "string") {
metadata.title = parsed.name.trim();
} else if (typeof parsed.headline === "string") {
metadata.title = parsed.headline.trim();
}
if (parsed.author && typeof parsed.author.name === "string") {
metadata.byline = parsed.author.name.trim();
}
if (typeof parsed.description === "string") {
metadata.excerpt = parsed.description.trim();
}
if (
parsed.publisher &&
typeof parsed.publisher.name === "string"
) {
metadata.siteName = parsed.publisher.name.trim();
}
return metadata;
} catch (err) {
this.log(err.message);
}
}
return {};
},
/**
* Attempts to get title, byline, excerpt and site name metadata for the article.
*
* @param {Object} jsonld object containing any metadata that
* could be extracted from JSON-LD object. Values present here take
* precedence over the other metadata sources.
*
* @return Object with optional "title", "byline", "excerpt" and "siteName" properties
*/
_getArticleMetadata: function() {
_getArticleMetadata: function(jsonld) {
var metadata = {};
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");
@ -1343,7 +1429,8 @@ Readability.prototype = {
});
// get title
metadata.title = values["dc:title"] ||
metadata.title = jsonld.title ||
values["dc:title"] ||
values["dcterm:title"] ||
values["og:title"] ||
values["weibo:article:title"] ||
@ -1356,12 +1443,14 @@ Readability.prototype = {
}
// get author
metadata.byline = values["dc:creator"] ||
metadata.byline = jsonld.byline ||
values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"];
// get description
metadata.excerpt = values["dc:description"] ||
metadata.excerpt = jsonld.excerpt ||
values["dc:description"] ||
values["dcterm:description"] ||
values["og:description"] ||
values["weibo:article:description"] ||
@ -1370,7 +1459,8 @@ Readability.prototype = {
values["twitter:description"];
// get site name
metadata.siteName = values["og:site_name"];
metadata.siteName = jsonld.siteName ||
values["og:site_name"];
// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
@ -2029,12 +2119,15 @@ Readability.prototype = {
// Unwrap image from noscript
this._unwrapNoscriptImages(this._doc);
// Extract JSON-LD metadata before removing scripts
var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
// Remove script tags from the document.
this._removeScripts(this._doc);
this._prepDocument();
var metadata = this._getArticleMetadata();
var metadata = this._getArticleMetadata(jsonLd);
this._articleTitle = metadata.title;
var articleContent = this._grabArticle();

@ -2,7 +2,7 @@
"title": "Facebook Is Tracking Me Even Though Im Not on Facebook",
"byline": "By Daniel Kahn Gillmor, Senior Staff Technologist, ACLU Speech, Privacy, and Technology Project",
"dir": "ltr",
"excerpt": "I don't use Facebook. I'm not technophobic — I'm a geek. I've been using email since the early 1990s, I have accounts on hundreds of services around the net, and I do software development and internet protocol design both for work and for fun. I believe that a globe-spanning communications network like the internet can be a positive social force, and I publish much of my own work on the open web.",
"excerpt": "Facebook collects data about people who have never even opted in. But there are ways these non-users can protect themselves.",
"readerable": true,
"siteName": "American Civil Liberties Union"
}

@ -1,5 +1,5 @@
{
"title": "Obama admits US gun laws are his 'biggest frustration' - BBC News",
"title": "Obama admits US gun laws are his 'biggest frustration'",
"byline": null,
"excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.",
"readerable": true,

@ -2,7 +2,7 @@
"title": "Tite diz que errou ao levar taça da Libertadores a Lula em 2012",
"byline": "21.dez.2018 às 10h55",
"dir": null,
"excerpt": "Após rechaçar um encontro da seleção brasileira com o presidente eleito Jair Bolsonaro, o técnico Tite declarou que errou ao levar a taça da Copa Libertadores de 2012, conquistada pelo Corinthians, ao ex-presidente Luiz Inácio Lula da Silva.",
"excerpt": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente",
"siteName": "Folha de S.Paulo",
"readerable": true
}

@ -204,7 +204,7 @@
"description": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente",
"datePublished": "2018-12-21T12:55:00Z",
"image": { "@type": "ImageObject", "url": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_md.jpg", "width": "768", "height": "512" }
"image": { "@type": "ImageObject", "url": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_md.jpg", "width": "768", "height": "512" },
"contentLocation": {
"@type": "Place",

@ -2,7 +2,7 @@
"title": "Node.js and CPU profiling on production (in real-time without downtime)",
"byline": "Vincent Vallet",
"dir": null,
"excerpt": "Why CPU monitoring is important?",
"siteName": "Medium",
"excerpt": "How to run a CPU profiling with Node.js on your production in real-time and without interruption of service.",
"siteName": "Voodoo Engineering",
"readerable": true
}

@ -1,8 +1,8 @@
{
"title": "The 21 best movies of 2017",
"byline": "By Alissa Wilkinson@alissamarie\n Updated Jul 24, 2018, 2:15pm EDT",
"title": "How to watch the 21 best films of 2017",
"byline": "Alissa Wilkinson",
"dir": null,
"excerpt": "How to watch the greatest movies of the year, from Lady Bird and Dunkirk to Get Out and The Big Sick.",
"excerpt": "It was an extraordinary year for movies.",
"siteName": "Vox",
"readerable": true
}

@ -2,7 +2,7 @@
"title": "Screenshot : «Vape Wave», «6 Days», «Alphonse Président»…",
"byline": "Par Alexandre Hervaud et Jérémy Piette",
"dir": null,
"excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine. Pour dépasser...",
"siteName": "Libération.fr",
"excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine.\nPour dépasser...",
"siteName": "Libération",
"readerable": true
}

@ -1,8 +1,8 @@
{
"title": "New Zealand - Wikipedia",
"byline": "Authority control",
"title": "New Zealand",
"byline": "Contributors to Wikimedia projects",
"dir": "ltr",
"excerpt": "Coordinates: 42°S 174°E / 42°S 174°E",
"siteName": null,
"siteName": "Wikimedia Foundation, Inc.",
"readerable": true
}

@ -1,8 +1,8 @@
{
"title": "Hermitian matrix - Wikipedia",
"byline": null,
"title": "Hermitian matrix",
"byline": "Contributors to Wikimedia projects",
"dir": "ltr",
"excerpt": "In mathematics, a Hermitian matrix (or self-adjoint matrix) is a complex square matrix that is equal to its own conjugate transpose—that is, the element in the i-th row and j-th column is equal to the complex conjugate of the element in the j-th row and i-th column, for all indices i and j:",
"siteName": null,
"siteName": "Wikimedia Foundation, Inc.",
"readerable": true
}

@ -157,23 +157,23 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe
});
it("should extract expected title", function() {
expect(expectedMetadata.title).eql(result.title);
expect(result.title).eql(expectedMetadata.title);
});
it("should extract expected byline", function() {
expect(expectedMetadata.byline).eql(result.byline);
expect(result.byline).eql(expectedMetadata.byline);
});
it("should extract expected excerpt", function() {
expect(expectedMetadata.excerpt).eql(result.excerpt);
expect(result.excerpt).eql(expectedMetadata.excerpt);
});
it("should extract expected site name", function() {
expect(expectedMetadata.siteName).eql(result.siteName);
expect(result.siteName).eql(expectedMetadata.siteName);
});
expectedMetadata.dir && it("should extract expected direction", function() {
expect(expectedMetadata.dir).eql(result.dir);
expect(result.dir).eql(expectedMetadata.dir);
});
});
}

Loading…
Cancel
Save