diff --git a/plugins/opds.koplugin/opdsparser.lua b/plugins/opds.koplugin/opdsparser.lua index 20ee727ed..a1c247a99 100644 --- a/plugins/opds.koplugin/opdsparser.lua +++ b/plugins/opds.koplugin/opdsparser.lua @@ -72,10 +72,6 @@ function OPDSParser:createFlatXTable(xlex, curr_element) end function OPDSParser:parse(text) - -- Murder Calibre's whole "content" block, because luxl doesn't really deal well with various XHTML quirks, - -- as the list of crappy replacements below attests to... - -- There's also a high probability of finding orphaned tags or badly nested ones in there, which will screw everything up. - text = text:gsub('.-', '') -- luxl doesn't handle XML comments, so strip them text = text:gsub("", "") -- luxl is also particular about the syntax for self-closing, empty & orphaned tags... @@ -84,8 +80,18 @@ function OPDSParser:parse(text) text = text:gsub("<([bh]r)>", "<%1 />") -- Some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems text = text:gsub("", function (s) - return s:gsub( "%p", {["&"] = "&", ["<"] = "<", [">"] = ">" } ) + return s:gsub("%p", {["&"] = "&", ["<"] = "<", [">"] = ">"}) end ) + + -- NOTE: OPDS content tags are likely to contain a bunch of HTML or XHTML. We do *NOT* want to let luxl parse that, + -- because it doesn't really deal well with various XHTML quirks, as the list of crappy replacements above attests to... + -- There's also a high probability of finding orphaned tags or badly nested ones in there, which would screw everything up. + -- In any case, we just want to treat the whole thing as a single text node anyway, so, just mangle the markup to force luxl's hand. + text = text:gsub('', "") + text = text:gsub("(.-)", function (s) + return '' .. s:gsub("%p", {["<"] = "<", [">"] = ">", ['"'] = """, ["'"] = "'"}) .. "" + end ) + local xlex = luxl.new(text, #text) return assert(self:createFlatXTable(xlex)) end