diff --git a/Readability.js b/Readability.js index 00f6cc2..62c76a9 100644 --- a/Readability.js +++ b/Readability.js @@ -53,6 +53,7 @@ function Readability(doc, options) { this._serializer = options.serializer || function(el) { return el.innerHTML; }; + this._disableJSONLD = !!options.disableJSONLD; // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | @@ -135,7 +136,9 @@ Readability.prototype = { whitespace: /^\s*$/, hasContent: /\S$/, srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, - b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // See: https://schema.org/Article (the group keeps ^ and $ bound to every alternative) + jsonLdArticleTypes: /^(?:Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference)$/ }, DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], @@ -244,6 +247,21 @@ Readability.prototype = { Array.prototype.forEach.call(nodeList, fn, this); }, + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return Node|undefined The first node passing the test, or undefined if none does + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + /** * Iterate over a NodeList, return true if any of the provided iterate * function calls returns true, false otherwise. @@ -1292,12 +1310,80 @@ Readability.prototype = { }); }, + /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported.
+ * @return Object with any metadata that could be extracted (possibly none) + */ + _getJSONLD: function (doc) { + var scripts = this._getAllNodesWithTag(doc, ["script"]); + + var jsonLdElement = this._findNode(scripts, function(el) { + return el.getAttribute("type") === "application/ld+json"; + }); + + if (jsonLdElement) { + try { + // Strip CDATA markers if present + var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, ""); + var parsed = JSON.parse(content); + var metadata = {}; + if ( + typeof parsed["@context"] !== "string" || + !parsed["@context"].match(/^https?\:\/\/schema\.org$/) + ) { + return metadata; + } + + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { + parsed = parsed["@graph"].find(function(it) { + return (it["@type"] || "").match( + this.REGEXPS.jsonLdArticleTypes + ); + }, this); + } + + if ( + !parsed || + !parsed["@type"] || + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) + ) { + return metadata; + } + if (typeof parsed.name === "string") { + metadata.title = parsed.name.trim(); + } else if (typeof parsed.headline === "string") { + metadata.title = parsed.headline.trim(); + } + if (parsed.author && typeof parsed.author.name === "string") { + metadata.byline = parsed.author.name.trim(); + } + if (typeof parsed.description === "string") { + metadata.excerpt = parsed.description.trim(); + } + if ( + parsed.publisher && + typeof parsed.publisher.name === "string" + ) { + metadata.siteName = parsed.publisher.name.trim(); + } + return metadata; + } catch (err) { + this.log(err.message); + } + } + return {}; + }, + /** * Attempts to get excerpt and byline metadata for the article. * + * @param {Object} jsonld — object containing any metadata that + * could be extracted from JSON-LD object.
+ * * @return Object with optional "excerpt" and "byline" properties */ - _getArticleMetadata: function() { + _getArticleMetadata: function(jsonld) { var metadata = {}; var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); @@ -1343,7 +1429,8 @@ Readability.prototype = { }); // get title - metadata.title = values["dc:title"] || + metadata.title = jsonld.title || + values["dc:title"] || values["dcterm:title"] || values["og:title"] || values["weibo:article:title"] || @@ -1356,12 +1443,14 @@ Readability.prototype = { } // get author - metadata.byline = values["dc:creator"] || + metadata.byline = jsonld.byline || + values["dc:creator"] || values["dcterm:creator"] || values["author"]; // get description - metadata.excerpt = values["dc:description"] || + metadata.excerpt = jsonld.excerpt || + values["dc:description"] || values["dcterm:description"] || values["og:description"] || values["weibo:article:description"] || @@ -1370,7 +1459,8 @@ Readability.prototype = { values["twitter:description"]; // get site name - metadata.siteName = values["og:site_name"]; + metadata.siteName = jsonld.siteName || + values["og:site_name"]; // in many sites the meta value is escaped with HTML entities, // so here we need to unescape it @@ -2029,12 +2119,15 @@ Readability.prototype = { // Unwrap image from noscript this._unwrapNoscriptImages(this._doc); + // Extract JSON-LD metadata before removing scripts + var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); + // Remove script tags from the document. 
this._removeScripts(this._doc); this._prepDocument(); - var metadata = this._getArticleMetadata(); + var metadata = this._getArticleMetadata(jsonLd); this._articleTitle = metadata.title; var articleContent = this._grabArticle(); diff --git a/test/test-pages/aclu/expected-metadata.json b/test/test-pages/aclu/expected-metadata.json index 9cfcc55..de60f89 100644 --- a/test/test-pages/aclu/expected-metadata.json +++ b/test/test-pages/aclu/expected-metadata.json @@ -2,7 +2,7 @@ "title": "Facebook Is Tracking Me Even Though I’m Not on Facebook", "byline": "By Daniel Kahn Gillmor, Senior Staff Technologist, ACLU Speech, Privacy, and Technology Project", "dir": "ltr", - "excerpt": "I don't use Facebook. I'm not technophobic — I'm a geek. I've been using email since the early 1990s, I have accounts on hundreds of services around the net, and I do software development and internet protocol design both for work and for fun. I believe that a globe-spanning communications network like the internet can be a positive social force, and I publish much of my own work on the open web.", + "excerpt": "Facebook collects data about people who have never even opted in. 
But there are ways these non-users can protect themselves.", "readerable": true, "siteName": "American Civil Liberties Union" } diff --git a/test/test-pages/bbc-1/expected-metadata.json b/test/test-pages/bbc-1/expected-metadata.json index 61211f0..ae869e3 100644 --- a/test/test-pages/bbc-1/expected-metadata.json +++ b/test/test-pages/bbc-1/expected-metadata.json @@ -1,5 +1,5 @@ { - "title": "Obama admits US gun laws are his 'biggest frustration' - BBC News", + "title": "Obama admits US gun laws are his 'biggest frustration'", "byline": null, "excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.", "readerable": true, diff --git a/test/test-pages/folha/expected-metadata.json b/test/test-pages/folha/expected-metadata.json index 8bdaacf..09b8dc3 100644 --- a/test/test-pages/folha/expected-metadata.json +++ b/test/test-pages/folha/expected-metadata.json @@ -2,7 +2,7 @@ "title": "Tite diz que errou ao levar taça da Libertadores a Lula em 2012", "byline": "21.dez.2018 às 10h55", "dir": null, - "excerpt": "Após rechaçar um encontro da seleção brasileira com o presidente eleito Jair Bolsonaro, o técnico Tite declarou que errou ao levar a taça da Copa Libertadores de 2012, conquistada pelo Corinthians, ao ex-presidente Luiz Inácio Lula da Silva.", + "excerpt": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente", "siteName": "Folha de S.Paulo", "readerable": true } diff --git a/test/test-pages/folha/source.html b/test/test-pages/folha/source.html index b8030a4..03f749c 100644 --- a/test/test-pages/folha/source.html +++ b/test/test-pages/folha/source.html @@ -204,7 +204,7 @@ "description": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente", "datePublished": "2018-12-21T12:55:00Z", - "image": { "@type": "ImageObject", "url": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_md.jpg", "width": 
"768", "height": "512" } + "image": { "@type": "ImageObject", "url": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_md.jpg", "width": "768", "height": "512" }, "contentLocation": { "@type": "Place", diff --git a/test/test-pages/lazy-image-1/expected-metadata.json b/test/test-pages/lazy-image-1/expected-metadata.json index 2f5245b..199e348 100644 --- a/test/test-pages/lazy-image-1/expected-metadata.json +++ b/test/test-pages/lazy-image-1/expected-metadata.json @@ -2,7 +2,7 @@ "title": "Node.js and CPU profiling on production (in real-time without downtime)", "byline": "Vincent Vallet", "dir": null, - "excerpt": "Why CPU monitoring is important?", - "siteName": "Medium", + "excerpt": "How to run a CPU profiling with Node.js on your production in real-time and without interruption of service.", + "siteName": "Voodoo Engineering", "readerable": true } diff --git a/test/test-pages/videos-1/expected-metadata.json b/test/test-pages/videos-1/expected-metadata.json index 401ebb6..def1553 100644 --- a/test/test-pages/videos-1/expected-metadata.json +++ b/test/test-pages/videos-1/expected-metadata.json @@ -1,8 +1,8 @@ { - "title": "The 21 best movies of 2017", - "byline": "By Alissa Wilkinson@alissamarie\n Updated Jul 24, 2018, 2:15pm EDT", + "title": "How to watch the 21 best films of 2017", + "byline": "Alissa Wilkinson", "dir": null, - "excerpt": "How to watch the greatest movies of the year, from Lady Bird and Dunkirk to Get Out and The Big Sick.", + "excerpt": "It was an extraordinary year for movies.", "siteName": "Vox", "readerable": true } diff --git a/test/test-pages/videos-2/expected-metadata.json b/test/test-pages/videos-2/expected-metadata.json index 64c215a..50f0118 100644 --- a/test/test-pages/videos-2/expected-metadata.json +++ b/test/test-pages/videos-2/expected-metadata.json @@ -2,7 +2,7 @@ "title": "Screenshot : «Vape Wave», «6 Days», «Alphonse Président»…", "byline": "Par Alexandre Hervaud et Jérémy Piette", "dir": 
null, - "excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine. Pour dépasser...", - "siteName": "Libération.fr", + "excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine.\nPour dépasser...", + "siteName": "Libération", "readerable": true } diff --git a/test/test-pages/wikipedia-2/expected-metadata.json b/test/test-pages/wikipedia-2/expected-metadata.json index e03a57c..fd1a2e3 100644 --- a/test/test-pages/wikipedia-2/expected-metadata.json +++ b/test/test-pages/wikipedia-2/expected-metadata.json @@ -1,8 +1,8 @@ { - "title": "New Zealand - Wikipedia", - "byline": "Authority control", + "title": "New Zealand", + "byline": "Contributors to Wikimedia projects", "dir": "ltr", "excerpt": "Coordinates: 42°S 174°E / 42°S 174°E", - "siteName": null, + "siteName": "Wikimedia Foundation, Inc.", "readerable": true } diff --git a/test/test-pages/wikipedia-3/expected-metadata.json b/test/test-pages/wikipedia-3/expected-metadata.json index 5755c16..1b105e8 100644 --- a/test/test-pages/wikipedia-3/expected-metadata.json +++ b/test/test-pages/wikipedia-3/expected-metadata.json @@ -1,8 +1,8 @@ { - "title": "Hermitian matrix - Wikipedia", - "byline": null, + "title": "Hermitian matrix", + "byline": "Contributors to Wikimedia projects", "dir": "ltr", "excerpt": "In mathematics, a Hermitian matrix (or self-adjoint matrix) is a complex square matrix that is equal to its own conjugate transpose—that is, the element in the i-th row and j-th column is equal to the complex conjugate of the element in the j-th row and i-th column, for all indices i and j:", - "siteName": null, + "siteName": "Wikimedia Foundation, Inc.", "readerable": true } diff --git a/test/test-readability.js b/test/test-readability.js index 73e1a1d..0adbc79 100644 --- a/test/test-readability.js +++ b/test/test-readability.js 
@@ -157,23 +157,23 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe }); it("should extract expected title", function() { - expect(expectedMetadata.title).eql(result.title); + expect(result.title).eql(expectedMetadata.title); }); it("should extract expected byline", function() { - expect(expectedMetadata.byline).eql(result.byline); + expect(result.byline).eql(expectedMetadata.byline); }); it("should extract expected excerpt", function() { - expect(expectedMetadata.excerpt).eql(result.excerpt); + expect(result.excerpt).eql(expectedMetadata.excerpt); }); it("should extract expected site name", function() { - expect(expectedMetadata.siteName).eql(result.siteName); + expect(result.siteName).eql(expectedMetadata.siteName); }); expectedMetadata.dir && it("should extract expected direction", function() { - expect(expectedMetadata.dir).eql(result.dir); + expect(result.dir).eql(expectedMetadata.dir); }); }); }