From 960b2ae62ae71d17cf59d22ca7ae249884de5873 Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Sun, 22 Nov 2020 04:51:32 +0100 Subject: [PATCH] OPDS*: Mangle Calibre feeds some more so that they don't confuse luxl/us (#6902) By essentially dropping the whole XHTML block, instead of trying to salvage each and every tag one by one as we did before. Also, as that's usually the result after broken parsing, handle nil URLs slightly better in the frontend, so that they get caught/reported properly instead of doing nothing and/or crashing half the time. --- frontend/ui/opdsparser.lua | 26 ++++++++++++++++---------- frontend/ui/widget/opdsbrowser.lua | 18 +++++++++--------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/frontend/ui/opdsparser.lua b/frontend/ui/opdsparser.lua index a78eadcf7..a1321b3a9 100644 --- a/frontend/ui/opdsparser.lua +++ b/frontend/ui/opdsparser.lua @@ -33,13 +33,12 @@ end function OPDSParser:createFlatXTable(xlex, curr_element) curr_element = curr_element or {} - local curr_attr_name; - local attr_count = 0; + local curr_attr_name + local attr_count = 0 -- start reading the thing - local txt for event, offset, size in xlex:Lexemes() do - txt = ffi.string(xlex.buf + offset, size) + local txt = ffi.string(xlex.buf + offset, size) if event == luxl.EVENT_START then if txt ~= "xml" then -- does current element already have something @@ -61,7 +60,7 @@ function OPDSParser:createFlatXTable(xlex, curr_element) curr_attr_name = unescape(txt) elseif event == luxl.EVENT_ATTR_VAL then curr_element[curr_attr_name] = unescape(txt) - attr_count = attr_count + 1; + attr_count = attr_count + 1 curr_attr_name = nil elseif event == luxl.EVENT_TEXT then curr_element = unescape(txt) @@ -73,16 +72,23 @@ function OPDSParser:createFlatXTable(xlex, curr_element) end function OPDSParser:parse(text) - -- luxl cannot properly handle xml comments and we need first remove them - text = text:gsub("", "") - -- luxl prefers
<br />, other two forms are valid in HTML, - -- but will kick the ass of luxl + -- Murder Calibre's whole "content" block, because luxl doesn't really deal well with various XHTML quirks, + -- as the list of crappy replacements below attests to... + -- There's also a high probability of finding orphaned tags or badly nested ones in there, which will screw everything up. + text = text:gsub('<content type="xhtml">.-</content>', '') + -- luxl doesn't handle XML comments, so strip them + text = text:gsub("<!%-%-.-%-%->", "") + -- luxl prefers
<br />, the other two forms are valid in HTML, but will kick luxl's ass text = text:gsub("<br>", "<br />") text = text:gsub("<br/>", "<br />") -- Same deal with hr text = text:gsub("<hr>", "<hr />") text = text:gsub("<hr/>", "<hr />
") - -- some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems + -- It's also allergic to orphaned (As opposed to a balanced pair)... + text = text:gsub("", "") + -- Let's assume it might also happen to strong... + text = text:gsub("", "") + -- Some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems text = text:gsub("", function (s) return s:gsub( "%p", {["&"] = "&", ["<"] = "<", [">"] = ">" } ) end ) diff --git a/frontend/ui/widget/opdsbrowser.lua b/frontend/ui/widget/opdsbrowser.lua index 62a2a4780..b1277115c 100644 --- a/frontend/ui/widget/opdsbrowser.lua +++ b/frontend/ui/widget/opdsbrowser.lua @@ -78,7 +78,7 @@ function OPDSBrowser:init() servers = { { title = "Project Gutenberg", - url = "http://m.gutenberg.org/ebooks.opds/?format=opds", + url = "https://m.gutenberg.org/ebooks.opds/?format=opds", }, { title = "Project Gutenberg [Searchable]", @@ -87,11 +87,11 @@ function OPDSBrowser:init() }, { title = "Feedbooks", - url = "http://www.feedbooks.com/publicdomain/catalog.atom", + url = "https://catalog.feedbooks.com/catalog/public_domain.atom", }, { title = "ManyBooks", - url = "http://manybooks.net/opds/index.php", + url = "https://manybooks.net/opds/index.php", }, { title = "Internet Archive", @@ -99,11 +99,11 @@ function OPDSBrowser:init() }, { title = "Flibusta (Russian)", - url = "http://www.flibusta.is/opds", + url = "https://www.flibusta.is/opds", }, { title = "Flibusta [Ru] [Searchable]", - url = "http://www.flibusta.is/opds/search?searchTerm=%s", + url = "https://www.flibusta.is/opds/search?searchTerm=%s", searchable = true, }, { @@ -388,9 +388,9 @@ end function OPDSBrowser:getCatalog(item_url, username, password) local ok, catalog = pcall(self.parseFeed, self, item_url, username, password) if not ok and catalog then - logger.info("cannot get catalog info from", item_url, catalog) + logger.info("cannot get catalog info from", item_url or "nil", catalog) UIManager:show(InfoMessage:new{ - 
text = T(_("Cannot get catalog info from %1"), (BD.url(item_url) or "")), + text = T(_("Cannot get catalog info from %1"), (item_url and BD.url(item_url) or "nil")), }) return end @@ -498,7 +498,7 @@ function OPDSBrowser:genItemTableFromCatalog(catalog, item_url, username, passwo end end if author then - item.text = title .. "\n" .. author + item.text = title .. " - " .. author end end item.title = title @@ -676,7 +676,7 @@ function OPDSBrowser:showDownloads(item) end function OPDSBrowser:browse(browse_url, username, password) - logger.dbg("Browse opds url", browse_url) + logger.dbg("Browse opds url", browse_url or "nil") table.insert(self.paths, { url = browse_url, username = username,