From 5cfe9992adf2169f07ee0f26813859dbe892375f Mon Sep 17 00:00:00 2001 From: poire-z Date: Mon, 29 Jan 2018 20:50:26 +0100 Subject: [PATCH] Wiki Save as EPUB: better handling of math formulas (#3639) Also fix some bad filename and extension in epub's OEBPS/content.opf, Store image files (jpg, png, gif) uncompressed in the .epub as they don't need compression. --- frontend/ui/wikipedia.lua | 47 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/frontend/ui/wikipedia.lua b/frontend/ui/wikipedia.lua index 5d818c14b..6c1e7f96a 100644 --- a/frontend/ui/wikipedia.lua +++ b/frontend/ui/wikipedia.lua @@ -690,14 +690,36 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images) elseif src:sub(1,1) == "/" then -- non absolute url src = wiki_base_url .. src end + -- Some SVG urls don't have any extension, like: + -- "/api/rest_v1/media/math/render/svg/154a342afea5a9f13caf1a5bb6acd5c4e69733b6"" + -- Furthermore, as of early 2018, it looks like most (all?) mathematical SVG + -- obtained from such urls use features not supported by crengine's nanosvg + -- renderer (so, they are displayed as a blank square). + -- But we can get a PNG version of it thanks to wikipedia APIs : + -- https://wikimedia.org/api/rest_v1/#!/Math/get_media_math_render_format_hash + -- We tweak the url now (and fix the mimetype below), before checking for + -- duplicates in seen_images. + -- Think about disabling that when nanosvg gets better! + if src:find("/math/render/svg/") then + src = src:gsub("/math/render/svg/", "/math/render/png/") + end local cur_image if seen_images[src] then -- already seen cur_image = seen_images[src] else - local ext = src:match(".*%.(%S+)") - if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it - logger.info("no file extension found in ", src) - return nil + local src_ext = src + if src_ext:find("?") then -- "/w/extensions/wikihiero/img/hiero_D22.png?0b8f1" + src_ext = src_ext:match("(.-)%?") -- remove ?blah + end + local ext = src_ext:match(".*%.(%S%S%S?%S?%S?)$") -- extensions are only 2 to 5 chars + if ext == nil or ext == "" then + if src_ext:find("/math/render/png/") then -- tweaked above + ext = "png" + else + -- we won't know what mimetype to use, ignore it + logger.info("no file extension found in ", src) + return nil + end end ext = ext:lower() local imgid = string.format("img%05d", imagenum) @@ -1039,6 +1061,16 @@ time, abbr, sup { -- want to remove or replace those that should be considered 'inline' elements html = html:gsub("]*>", "") + -- crengine does not support the family of tags for displaying formulas, + -- which results in lots of space taken by individual character in the formula, each + -- on a single line... + -- Also, usally, these tags are followed by a tag pointing to a + -- SVG version of the formula, that we took care earlier to change the url to + -- point to a PNG version of the formula (which is still not perfect, as it does + -- not adjust to the current html font size, but it is at least readable). + -- So, remove the whole ... content + html = html:gsub([[]], "") + -- Fix internal wikipedia links with full server url (including lang) so -- ReaderLink can notice them and deal with them with a LookupWikipedia event. -- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]]) @@ -1154,7 +1186,12 @@ time, abbr, sup { logger.info("failed fetching:", src) end if success then - epub:add("OEBPS/"..img.imgpath, content) + -- Images do not need to be compressed, so spare some cpu cycles + local no_compression = true + if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text) + no_compression = false + end + epub:add("OEBPS/"..img.imgpath, content, no_compression) else go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue")) if not go_on then