Wiki Save as EPUB: better handling of math formulas (#3639)

Also fix some bad filename and extension in epub's OEBPS/content.opf,
Store image files (jpg, png, gif) uncompressed in the .epub as they
don't need compression.
pull/3640/head
poire-z 6 years ago committed by Frans de Jonge
parent e7377cd20e
commit 5cfe9992ad

@ -690,14 +690,36 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
elseif src:sub(1,1) == "/" then -- non absolute url
src = wiki_base_url .. src
end
-- Some SVG urls don't have any extension, like:
-- "/api/rest_v1/media/math/render/svg/154a342afea5a9f13caf1a5bb6acd5c4e69733b6""
-- Furthermore, as of early 2018, it looks like most (all?) mathematical SVG
-- obtained from such urls use features not supported by crengine's nanosvg
-- renderer (so, they are displayed as a blank square).
-- But we can get a PNG version of it thanks to wikipedia APIs :
-- https://wikimedia.org/api/rest_v1/#!/Math/get_media_math_render_format_hash
-- We tweak the url now (and fix the mimetype below), before checking for
-- duplicates in seen_images.
-- Think about disabling that when nanosvg gets better!
if src:find("/math/render/svg/") then
src = src:gsub("/math/render/svg/", "/math/render/png/")
end
local cur_image
if seen_images[src] then -- already seen
cur_image = seen_images[src]
else
local ext = src:match(".*%.(%S+)")
if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it
logger.info("no file extension found in ", src)
return nil
local src_ext = src
if src_ext:find("?") then -- "/w/extensions/wikihiero/img/hiero_D22.png?0b8f1"
src_ext = src_ext:match("(.-)%?") -- remove ?blah
end
local ext = src_ext:match(".*%.(%S%S%S?%S?%S?)$") -- extensions are only 2 to 5 chars
if ext == nil or ext == "" then
if src_ext:find("/math/render/png/") then -- tweaked above
ext = "png"
else
-- we won't know what mimetype to use, ignore it
logger.info("no file extension found in ", src)
return nil
end
end
ext = ext:lower()
local imgid = string.format("img%05d", imagenum)
@ -1039,6 +1061,16 @@ time, abbr, sup {
-- want to remove or replace those that should be considered 'inline' elements
html = html:gsub("</?time[^>]*>", "")
-- crengine does not support the <math> family of tags for displaying formulas,
-- which results in lots of space taken by individual character in the formula, each
-- on a single line...
-- Also, usally, these <math> tags are followed by a <img> tag pointing to a
-- SVG version of the formula, that we took care earlier to change the url to
-- point to a PNG version of the formula (which is still not perfect, as it does
-- not adjust to the current html font size, but it is at least readable).
-- So, remove the whole <math>...</math> content
html = html:gsub([[<math xmlns="http://www.w3.org/1998/Math/MathML".-</math>]], "")
-- Fix internal wikipedia links with full server url (including lang) so
-- ReaderLink can notice them and deal with them with a LookupWikipedia event.
-- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
@ -1154,7 +1186,12 @@ time, abbr, sup {
logger.info("failed fetching:", src)
end
if success then
epub:add("OEBPS/"..img.imgpath, content)
-- Images do not need to be compressed, so spare some cpu cycles
local no_compression = true
if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text)
no_compression = false
end
epub:add("OEBPS/"..img.imgpath, content, no_compression)
else
go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
if not go_on then

Loading…
Cancel
Save