OPDSParser: Attempt to preserve data from content tags *without* breaking luxl (#7768)

Tackle the content blocks issue differently in order to
preserve the data, which has become useful since #7767.
reviewable/pr7776/r1
NiLuJe 3 years ago committed by GitHub
parent d2ad6a83e1
commit c2c20199cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -72,10 +72,6 @@ function OPDSParser:createFlatXTable(xlex, curr_element)
end
function OPDSParser:parse(text)
-- Murder Calibre's whole "content" block, because luxl doesn't really deal well with various XHTML quirks,
-- as the list of crappy replacements below attests to...
-- There's also a high probability of finding orphaned tags or badly nested ones in there, which will screw everything up.
text = text:gsub('<content type="xhtml">.-</content>', '')
-- luxl doesn't handle XML comments, so strip them
text = text:gsub("<!%-%-.-%-%->", "")
-- luxl is also particular about the syntax for self-closing, empty & orphaned tags...
@ -84,8 +80,18 @@ function OPDSParser:parse(text)
text = text:gsub("<([bh]r)>", "<%1 />")
-- Some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems
text = text:gsub("<!%[CDATA%[(.-)%]%]>", function (s)
return s:gsub( "%p", {["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;" } )
return s:gsub("%p", {["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;"})
end )
-- NOTE: OPDS content tags are likely to contain a bunch of HTML or XHTML. We do *NOT* want to let luxl parse that,
-- because it doesn't really deal well with various XHTML quirks, as the list of crappy replacements above attests to...
-- There's also a high probability of finding orphaned tags or badly nested ones in there, which would screw everything up.
-- In any case, we just want to treat the whole thing as a single text node anyway, so, just mangle the markup to force luxl's hand.
text = text:gsub('<content type=".-">', "<content>")
text = text:gsub("<content>(.-)</content>", function (s)
return '<content type="text">' .. s:gsub("%p", {["<"] = "&lt;", [">"] = "&gt;", ['"'] = "&quot;", ["'"] = "&apos;"}) .. "</content>"
end )
local xlex = luxl.new(text, #text)
return assert(self:createFlatXTable(xlex))
end

Loading…
Cancel
Save