Create a script to generate testcases, actually use our version of JSDOMParser

pull/31/head
Gijs Kruitbosch 9 years ago
parent b5e905ef6b
commit 1c42f29aa5

@ -105,6 +105,8 @@ Readability.prototype = {
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
/**
* Run any post-process modifications to article content as necessary.
*
@ -668,7 +670,7 @@ Readability.prototype = {
s -= 1;
sl -= 1;
if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(siblingNode.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident. */
this.log("Altering siblingNode:", siblingNode, 'to div.');

@ -4,7 +4,8 @@
"description": "A standalone version of the readability library used for Firefox Reader View.",
"main": "Readability.js",
"scripts": {
"test": "mocha"
"test": "mocha test/index.js",
"generate-testcase": "node test/generate-testcase.js"
},
"repository": {
"type": "git",

@ -0,0 +1,145 @@
var debug = false;
var path = require("path");
var fs = require("fs");
var jsdom = require("jsdom").jsdom;
var prettyPrint = require("html").prettyPrint;
var serializeDocument = require("jsdom").serializeDocument;
var http = require("http");
// We want to load Readability and JSDOMParser, which aren't set up as commonjs libraries,
// so we need to do some hocus-pocus with 'vm' to load both of them into one separate,
// shared scope context.
var vm = require("vm");
var readabilityPath = path.join(__dirname, "..", "Readability.js");
var jsdomPath = path.join(__dirname, "..", "JSDOMParser.js");
// Sandbox object that both library scripts are evaluated against. dump() and
// console.{whatever} are generally expected to work inside those scripts, so
// they are exposed on the sandbox up front.
var scopeContext = {
  dump: console.log,
  console: console
};

// Evaluate JSDOMParser first, then Readability, against the same sandbox.
// NB: if either of the files has parse errors, node reports the syntax error
// *at this callsite*. There is no syntax error here; go look in the file that
// was being loaded instead.
[jsdomPath, readabilityPath].forEach(function(scriptPath) {
  vm.runInNewContext(fs.readFileSync(scriptPath), scopeContext, scriptPath);
});

// Pull the globals we need back out of the sandbox for convenient local use.
var Readability = scopeContext.Readability;
var JSDOMParser = scopeContext.JSDOMParser;
// Command-line handling: argv[2] is the test-case slug (the directory name
// under test-pages), argv[3] is an optional URL to download the source from.
if (process.argv.length < 3) {
  console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
  // Missing arguments are a usage error, so signal failure to the caller.
  process.exit(1);
}

var slug = process.argv[2];
var url = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue.
var destRoot = path.join(__dirname, "test-pages", slug);

fs.mkdir(destRoot, function(err) {
  if (!err) {
    // The directory was just created, so there is no source.html to reuse.
    fetchSource(url, onResponseReceived);
    return;
  }

  // Only "already exists" means we may have a previously saved source;
  // anything else (permissions, missing parent, ...) is a real failure.
  if (err.code !== "EEXIST") {
    console.error("Couldn't create directory " + destRoot + "!");
    console.error(err);
    process.exit(1);
    return;
  }

  // Try to read the saved source directly instead of checking for existence
  // first: that avoids the deprecated fs.exists() and the race between the
  // check and the read.
  var sourceFile = path.join(destRoot, "source.html");
  fs.readFile(sourceFile, {encoding: "utf-8"}, function(readErr, data) {
    if (!readErr) {
      onResponseReceived(data);
      return;
    }
    if (readErr.code === "ENOENT") {
      // No saved source yet: fall back to downloading it.
      fetchSource(url, onResponseReceived);
      return;
    }
    console.error("Source existed but couldn't be read?");
    process.exit(1);
  });
});
/**
 * Download the document at `url` and pass its body to `callbackFn`.
 *
 * The body is decoded as UTF-8 and accumulated in memory; `callbackFn` is
 * called once, with the complete string, when the response ends.
 *
 * The process is terminated with exit status 1 when no URL was given, when
 * the request itself fails, or when the server answers with a non-2xx status
 * (redirects are not followed, so their stub bodies are rejected too).
 *
 * @param {string|undefined} url - Address to fetch; http: and https: are supported.
 * @param {function(string)} callbackFn - Receives the full response body.
 */
function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
    return;
  }

  // Compare against the scheme including its colon, case-insensitively, so
  // only real https: URLs select the https client.
  var client = http;
  if (url.toLowerCase().indexOf("https:") === 0) {
    client = require("https");
  }

  var request = client.get(url, function(response) {
    if (debug) {
      console.log("STATUS:", response.statusCode);
      console.log("HEADERS:", JSON.stringify(response.headers));
    }

    // Anything other than a 2xx answer would end up saved as the test-case
    // source (e.g. a redirect stub or an error page), so refuse it.
    if (response.statusCode < 200 || response.statusCode >= 300) {
      console.error("Unexpected HTTP status " + response.statusCode + " while fetching " + url);
      process.exit(1);
      return;
    }

    response.setEncoding("utf-8");
    var rv = "";
    response.on("data", function(chunk) {
      rv += chunk;
    });
    response.on("end", function() {
      if (debug) {
        console.log("End received");
      }
      callbackFn(rv);
    });
  });

  // Without this handler a DNS or connection failure surfaces as an uncaught
  // 'error' event instead of a readable message.
  request.on("error", function(err) {
    console.error("Couldn't fetch " + url + "!");
    console.error(err);
    process.exit(1);
  });
}
/**
 * Normalize the raw page source, save it as source.html inside `destRoot`,
 * and then run Readability on it to produce expected.html.
 *
 * The source is parsed with jsdom, serialized again and pretty-printed, so
 * the stored test case is the sanitized markup rather than the raw input.
 * If source.html cannot be written, the error is logged and the process
 * exits with status 1.
 *
 * @param {string} source - Raw HTML, either downloaded or read from disk.
 */
function onResponseReceived(source) {
  // Sanitize:
  var sanitized = prettyPrint(serializeDocument(jsdom(source)));
  if (debug) {
    console.log("writing");
  }
  var sourcePath = path.join(destRoot, "source.html");
  fs.writeFile(sourcePath, sanitized, function(err) {
    if (err) {
      console.error("Couldn't write data to source.html!");
      console.error(err);
      // Report the failure through the exit status as well as the log.
      process.exit(1);
      return;
    }
    if (debug) {
      console.log("Running readability stuff");
    }
    runReadability(sanitized, path.join(destRoot, "expected.html"));
  });
}
/**
 * Run Readability over `source` and write the pretty-printed article content
 * to `destPath`.
 *
 * The document is parsed with JSDOMParser and processed with a fixed fake
 * URI, so the output does not depend on where the page was downloaded from.
 * The process always exits from here: status 0 once expected.html has been
 * written, status 1 if Readability threw or produced no content, or if the
 * file could not be written.
 *
 * @param {string} source - Sanitized HTML of the page.
 * @param {string} destPath - Path of the expected.html file to create.
 */
function runReadability(source, destPath) {
  var doc = new JSDOMParser().parse(source);
  var uri = {
    spec: "http://fakehost/test/page.html",
    host: "fakehost",
    prePath: "http://fakehost",
    scheme: "http",
    pathBase: "http://fakehost/test"
  };

  var result = null;
  try {
    result = new Readability(uri, doc).parse();
  } catch (ex) {
    // Log and fall through: a thrown error is handled like "no content".
    console.error(ex);
  }

  if (!result) {
    console.error("No content generated by readability, not going to write expected.html!");
    process.exit(1);
    return;
  }

  fs.writeFile(destPath, prettyPrint(result.content), function(err) {
    if (err) {
      console.error("Couldn't write data to expected.html!");
      console.error(err);
      // A failed write must not be reported as success.
      process.exit(1);
      return;
    }
    process.exit(0);
  });
}

@ -1,9 +1,32 @@
var path = require("path");
var fs = require("fs");
var jsdom = require("jsdom");
var prettyPrint = require("html").prettyPrint;
var expect = require("chai").expect;
// We want to load Readability and JSDOMParser, which aren't set up as commonjs libraries,
// so we need to do some hocus-pocus with 'vm' to load both of them into one separate,
// shared scope context.
var vm = require("vm");
var readabilityPath = path.join(__dirname, "..", "Readability.js");
var jsdomPath = path.join(__dirname, "..", "JSDOMParser.js");
var scopeContext = {};
// We generally expect dump() and console.{whatever} to work, so make these available
// in the scope we're using:
scopeContext.dump = console.log
scopeContext.console = console;
// Actually load files. NB: if either of the files has parse errors,
// node is dumb and shows you a syntax error *at this callsite* . Don't try to find
// a syntax error on this line, there isn't one. Go look in the file it's loading instead.
vm.runInNewContext(fs.readFileSync(jsdomPath), scopeContext, jsdomPath);
vm.runInNewContext(fs.readFileSync(readabilityPath), scopeContext, readabilityPath);
// Now make references to the globals in our scope so we can use them easily:
var Readability = scopeContext.Readability;
var JSDOMParser = scopeContext.JSDOMParser;
var testPageRoot = path.join(__dirname, "test-pages");
var testPages = fs.readdirSync(testPageRoot).map(function(dir) {
return {
@ -17,40 +40,19 @@ describe("Test page", function() {
testPages.forEach(function(testPage) {
describe(testPage.dir, function() {
it("should render as expected", function(done) {
var source = fs.readFileSync(testPage.source, {encoding: "utf-8"});
var expected = fs.readFileSync(testPage.expected, {encoding: "utf-8"});
jsdom.env(
testPage.source,
[path.join(__dirname, "..", "Readability.js")],
{
features: {
FetchExternalResources : [],
ProcessExternalResources: false,
SkipExternalResources: false
},
created: function(errors, window) {
jsdom.getVirtualConsole(window).on("log", function() {
// Very strange argument set passed to describe console.log messages…
if (arguments[0].indexOf("Reader:") === 0) {
console.log(arguments[0], arguments[1][0]);
}
});
}
},
function (errors, window) {
expect(errors).eql(null);
var uri = {
spec: "http://fakehost/test/page.html",
host: "fakehost",
prePath: "http://fakehost",
scheme: "http",
pathBase: "http://fakehost/test"
};
var result = new window.Readability(uri, window.document).parse();
expect(prettyPrint(result.content)).eql(prettyPrint(expected))
done();
}
);
var source = fs.readFileSync(testPage.source, {encoding: "utf-8"});
var uri = {
spec: "http://fakehost/test/page.html",
host: "fakehost",
prePath: "http://fakehost",
scheme: "http",
pathBase: "http://fakehost/test"
};
var doc = new JSDOMParser().parse(source);
var result = new Readability(uri, doc).parse();
expect(prettyPrint(result.content)).eql(expected);
done();
});
});
});

Loading…
Cancel
Save