Fix script parsing to ignore closing tags in comments

pull/87/head
Gijs Kruitbosch 9 years ago
parent 0b3bce57bf
commit 8ddba60425

@ -919,14 +919,59 @@
},
readScript: function (node) {
var index = this.html.indexOf("</script>", this.currentChar);
if (index === -1) {
index = this.html.length;
while (this.currentChar < this.html.length) {
var c = this.nextChar();
var nextC = this.peekNext();
if (c === "<") {
if (nextC === "!" || nextC === "?") {
// We're still before the ! or ? that is starting this comment:
this.currentChar++;
node.appendChild(this.discardNextComment());
continue;
}
if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
// Go back before the '<' so we find the end tag.
this.currentChar--;
// Done with this script tag, the caller will close:
return;
}
}
// Either c wasn't a '<' or it was but we couldn't find either a comment
// or a closing script tag, so we should just parse as text until the next one
// comes along:
var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
var textNode = haveTextNode ? node.lastChild : new Text();
var n = this.html.indexOf("<", this.currentChar);
// Decrement this to include the current character *afterwards* so we don't get stuck
// looking for the same < all the time.
this.currentChar--;
if (n === -1) {
textNode.textContent += this.html.substring(this.currentChar, this.html.length);
this.currentChar = this.html.length;
} else {
textNode.textContent += this.html.substring(this.currentChar, n);
this.currentChar = n;
}
if (!haveTextNode)
node.appendChild(textNode);
}
var txt = new Text();
txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
node.appendChild(txt);
this.currentChar = index;
},
discardNextComment: function() {
if (this.match("--")) {
this.discardTo("-->");
} else {
var c = this.nextChar();
while (c !== ">") {
if (c === undefined)
return null;
if (c === '"' || c === "'")
this.readString(c);
c = this.nextChar();
}
}
return new Comment();
},
@ -964,20 +1009,9 @@
// them away in readChildren()). So just returning an empty Comment node
// here is sufficient.
if (c === "!" || c === "?") {
// We're still before the ! or ? that is starting this comment:
this.currentChar++;
if (this.match("--")) {
this.discardTo("-->");
} else {
var c = this.nextChar();
while (c !== ">") {
if (c === undefined)
return null;
if (c === '"' || c === "'")
this.readString(c);
c = this.nextChar();
}
}
return new Comment();
return this.discardNextComment();
}
// If we're reading a closing tag, return null. This means we've reached

@ -208,3 +208,42 @@ describe("Test JSDOM functionality", function() {
}
});
});
describe("Script parsing", function() {
  // Parses `html` and checks that the document's first child is a SCRIPT
  // element with the given text content, no element children and exactly
  // one child node.
  function expectSingleChildScript(html, expectedText) {
    var doc = new JSDOMParser().parse(html);
    expect(doc.firstChild.tagName).eql("SCRIPT");
    expect(doc.firstChild.textContent).eql(expectedText);
    expect(doc.firstChild.children.length).eql(0);
    expect(doc.firstChild.childNodes.length).eql(1);
  }

  it("should strip ?-based comments within script tags", function() {
    expectSingleChildScript('<script><?Silly test <img src="test"></script>', "");
  });

  it("should strip !-based comments within script tags", function() {
    // The "</script>" inside the comment must not terminate the outer element.
    expectSingleChildScript(
      '<script><!--Silly test > <script src="foo.js"></script>--></script>',
      ""
    );
  });

  it("should strip any other nodes within script tags", function() {
    // Markup inside the script is expected to come back as literal text,
    // not as child elements.
    var markup = "<div>Hello, I'm not really in a </div>";
    expectSingleChildScript("<script>" + markup + "</script>", markup);
  });

  it("should not be confused by partial closing tags", function() {
    var code = "var x = '<script>Hi<' + '/script>';";
    expectSingleChildScript("<script>" + code + "</script>", code);
  });
});

@ -0,0 +1,5 @@
{
"title": "Test script parsing",
"byline": null,
"excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua."
}

@ -0,0 +1,19 @@
<div id="readability-page-1" class="page">
<div>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua.</p>
<p>Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi
ut aliquip ex ea commodo consequat.</p>
<p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
<div>
<p>Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat.</p>
<p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
</div>

@ -0,0 +1,34 @@
<html>
<head><title>Test script parsing</title></head>
<body>
<script>
<!--
Silly test
<script src="foo.js"></script>
-->
</script>
<article>
<h1>Lorem</h1>
<div>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua.</p>
<p>Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat.</p>
<p>Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
<h2>Foo</h2>
<div>
<p>Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat.</p>
<p>Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
</article>
</body>
</html>
Loading…
Cancel
Save