From 2ded2bce5e27507020a82f6c3e64c37c260a73b6 Mon Sep 17 00:00:00 2001
From: poire-z
Date: Sat, 21 Jan 2017 19:23:13 +0100
Subject: [PATCH] Wikipedia: save full page as epub (html, with optional
 images)

Also, for the current text-only wikipedia full page: replace the ==s in
section titles with some unicode symbols for a better visual feeling of
hierarchy. These same symbols are also used in the .epub. Both can be
disabled by adding ["wikipedia_prettify"] = false to settings.reader.lua.

readerhighlight: close ButtonTable (like other buttons do) when looking up
wikipedia, otherwise, if we save as epub and switch to the new document,
this ButtonTable would never be closed and would stay in the UI.
---
 base                                          |   2 +-
 .../apps/reader/modules/readerhighlight.lua   |   1 +
 .../apps/reader/modules/readerwikipedia.lua   |  14 +-
 frontend/ui/widget/dictquicklookup.lua        |  67 +-
 frontend/ui/wikipedia.lua                     | 752 +++++++++++++++++-
 5 files changed, 821 insertions(+), 15 deletions(-)

diff --git a/base b/base
index 243533e95..e8a24fe9b 160000
--- a/base
+++ b/base
@@ -1 +1 @@
-Subproject commit 243533e95ffb61b841b70aa4d50fd80df9cbfcaa
+Subproject commit e8a24fe9b99b4c4ceb6c9329648e7a2f4d10bc0c
diff --git a/frontend/apps/reader/modules/readerhighlight.lua b/frontend/apps/reader/modules/readerhighlight.lua
index 780bf9ea7..e78546141 100644
--- a/frontend/apps/reader/modules/readerhighlight.lua
+++ b/frontend/apps/reader/modules/readerhighlight.lua
@@ -357,6 +357,7 @@ function ReaderHighlight:onHoldRelease()
             callback = function()
                 UIManager:scheduleIn(0.1, function()
                     self:lookupWikipedia()
+                    self:onClose()
                 end)
             end,
         },
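For reference, the ["wikipedia_prettify"] opt-out mentioned in the commit
message is a plain key in the Lua table returned by settings.reader.lua; a
minimal sketch (the surrounding keys are illustrative):

    return {
        -- ... other reader settings ...
        ["wikipedia_prettify"] = false,
    }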
diff --git a/frontend/apps/reader/modules/readerwikipedia.lua b/frontend/apps/reader/modules/readerwikipedia.lua
index 9372269de..8b97473e7 100644
--- a/frontend/apps/reader/modules/readerwikipedia.lua
+++ b/frontend/apps/reader/modules/readerwikipedia.lua
@@ -101,7 +101,7 @@ function ReaderWikipedia:initLanguages(word)
     end
 end
 
-function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
+function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang)
     if not NetworkMgr:isOnline() then
         NetworkMgr:promptWifiOn()
         return
@@ -109,8 +109,14 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
     -- word is the text to query. If get_fullpage is true, it is the
     -- exact wikipedia page title we want the full page of.
     self:initLanguages(word)
-    -- use first lang from self.wiki_languages, which may have been rotated by DictQuickLookup
-    local lang = self.wiki_languages[1]
+    local lang
+    if forced_lang then
+        -- use provided lang (from readerlink, when it notices that an external link is a wikipedia url)
+        lang = forced_lang
+    else
+        -- use first lang from self.wiki_languages, which may have been rotated by DictQuickLookup
+        lang = self.wiki_languages[1]
+    end
     logger.dbg("lookup word:", word, box, get_fullpage)
     -- no need to clean word if get_fullpage, as it is the exact wikipedia page title
     if word and not get_fullpage then
@@ -166,6 +172,7 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
                 word = page.title,
                 definition = definition,
                 is_fullpage = get_fullpage,
+                lang = lang,
             }
             table.insert(results, result)
         end
@@ -179,6 +186,7 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
                 word = word,
                 definition = self.no_page,
                 is_fullpage = get_fullpage,
+                lang = lang,
             }
         }
         logger.dbg("dummy result table:", word, results)
diff --git a/frontend/ui/widget/dictquicklookup.lua b/frontend/ui/widget/dictquicklookup.lua
index 263e4d086..c3772b36b 100644
--- a/frontend/ui/widget/dictquicklookup.lua
+++ b/frontend/ui/widget/dictquicklookup.lua
@@ -19,6 +19,7 @@ local Device = require("device")
 local Geom = require("ui/geometry")
 local Event = require("ui/event")
 local Font = require("ui/font")
+local util = require("util")
 local logger = require("logger")
 local _ = require("gettext")
 local T = require("ffi/util").template
@@ -245,10 +246,67 @@ function DictQuickLookup:update()
     -- Different sets of buttons if fullpage or not
     local buttons
     if self.is_fullpage then
-        -- Only a single wide close button, get a little more room for
-        -- closing by taping at bottom (on footer or on this button)
+        -- A save and a close button
         buttons = {
             {
+                {
+                    text = "Save as epub",
+                    callback = function()
+                        local InfoMessage = require("ui/widget/infomessage")
+                        local ConfirmBox = require("ui/widget/confirmbox")
+                        -- if forced_lang was specified, it may not be in our wiki_languages,
+                        -- but ReaderWikipedia will have put it in result.lang
+                        local lang = self.lang or self.wiki_languages_copy[1]
+                        -- Just to be safe (none of the invalid chars, except ':' for uninteresting
+                        -- Portal: or File: wikipedia pages, should be in lookup_word)
+                        local cleaned_lookupword = util.replaceInvalidChars(self.lookupword)
+                        local filename = cleaned_lookupword .. "."..string.upper(lang)..".epub"
+                        -- Find a directory to save the file into
+                        local dir = G_reader_settings:readSetting("wikipedia_save_dir")
+                        if not dir then dir = G_reader_settings:readSetting("download_dir") end -- OPDS dir
+                        if not dir then dir = G_reader_settings:readSetting("home_dir") end
+                        if not dir then dir = G_reader_settings:readSetting("lastdir") end
+                        if not dir then
+                            UIManager:show(InfoMessage:new{
+                                text = _("No directory to save page to!"),
+                            })
+                            return
+                        end
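+                        -- (Illustrative note, not part of this patch: the fallback
+                        -- chain above can be short-circuited by pinning a directory,
+                        -- e.g. from the Lua console, with a path of your choosing :
+                        --   G_reader_settings:saveSetting("wikipedia_save_dir", "/mnt/us/wikipedia"))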
+                        local epub_path = dir .. "/" .. filename
+                        UIManager:show(ConfirmBox:new{
+                            text = T(_("Save as %1?"), filename),
+                            ok_callback = function()
+                                UIManager:scheduleIn(0.1, function()
+                                    local Wikipedia = require("ui/wikipedia")
+                                    Wikipedia:createEpubWithUI(epub_path, self.lookupword, lang, function(success)
+                                        if success then
+                                            UIManager:show(ConfirmBox:new{
+                                                text = T(_("Page saved to:\n%1\n\nWould you like to read the downloaded page now?"), epub_path),
+                                                ok_callback = function()
+                                                    -- close all dict/wiki windows, without scheduleIn(highlight.clear())
+                                                    self:onHoldClose(true)
+                                                    -- close current ReaderUI in 1 sec, and create a new one
+                                                    UIManager:scheduleIn(1.0, function()
+                                                        local ReaderUI = require("apps/reader/readerui")
+                                                        local reader = ReaderUI:_getRunningInstance()
+                                                        if reader then
+                                                            reader:onClose()
+                                                        end
+                                                        ReaderUI:showReader(epub_path)
+                                                    end)
+                                                end,
+                                            })
+                                        else
+                                            UIManager:show(InfoMessage:new{
+                                                text = _("Failed saving Wikipedia page."),
+                                            })
+                                        end
+                                    end)
+                                end)
+                            end
+                        })
+                    end,
+                },
                 {
                     text = "Close",
                     callback = function()
@@ -457,6 +515,7 @@ function DictQuickLookup:changeDictionary(index)
     self.lookupword = self.results[index].word
     self.definition = self.results[index].definition
     self.is_fullpage = self.results[index].is_fullpage
+    self.lang = self.results[index].lang
     if self.is_fullpage then
         self.displayword = self.lookupword
     else
@@ -546,12 +605,12 @@ function DictQuickLookup:onClose()
     return true
 end
 
-function DictQuickLookup:onHoldClose()
+function DictQuickLookup:onHoldClose(no_clear)
     self:onClose()
     for i = #self.window_list, 1, -1 do
         local window = self.window_list[i]
         -- if one holds a highlight, let's clear it like in onClose()
-        if window.highlight then
+        if window.highlight and not no_clear then
             UIManager:scheduleIn(1, function()
                 window.highlight:clear()
             end)
diff --git a/frontend/ui/wikipedia.lua b/frontend/ui/wikipedia.lua
index 7437a2a4e..498b36301 100644
--- a/frontend/ui/wikipedia.lua
+++ b/frontend/ui/wikipedia.lua
@@ -1,10 +1,18 @@
 local JSON = require("json")
 local logger = require("logger")
+local util = require("ffi/util")
+local _ = require("gettext")
+local T = require("ffi/util").template
 
 --[[
 -- Query wikipedia using Wikimedia Web API.
 -- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
 -- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
+--
+-- To get parsed HTML:
+-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
+-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
+-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
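+--
+-- For illustration, with the wiki_phtml_params below, loadPage() assembles,
+-- for page "E-book" on the "en" wiki, a URL like (parameter order may vary,
+-- as the parameters come from an unordered Lua table):
+-- https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection=&page=E-book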
 --]]
 
 local Wikipedia = {
@@ -37,16 +45,33 @@ local Wikipedia = {
     -- (otherwise, we get the full text for only the first result, and
     -- no text at all for the others
   },
+  wiki_phtml_params = {
+    action = "parse",
+    format = "json",
+    -- we only need the following information
+    prop = "text|sections|displaytitle|revid",
+    -- page = nil, -- text to lookup, will be added below
+    -- disabletoc = "", -- if we want to remove toc IN html
+    disablelimitreport = "",
+    disableeditsection = "",
+  },
+  -- allow for disabling prettifying full page text
+  wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),
 }
 
 function Wikipedia:getWikiServer(lang)
   return string.format(self.wiki_server, lang or self.default_lang)
 end
 
+-- Possible values for page_type parameter to loadPage()
+local WIKIPEDIA_INTRO = 1
+local WIKIPEDIA_FULL = 2
+local WIKIPEDIA_PHTML = 3
+
 --[[
 -- return decoded JSON table from Wikipedia
 --]]
-function Wikipedia:loadPage(text, lang, intro, plain)
+function Wikipedia:loadPage(text, lang, page_type, plain)
     local socket = require('socket')
     local url = require('socket.url')
     local http = require('socket.http')
@@ -58,18 +83,25 @@ function Wikipedia:loadPage(text, lang, page_type, plain)
 
     local parsed = url.parse(self:getWikiServer(lang))
     parsed.path = self.wiki_path
-    if intro == true then -- search query
+    if page_type == WIKIPEDIA_INTRO then -- search query
         self.wiki_search_params.explaintext = plain and "" or nil
         for k,v in pairs(self.wiki_search_params) do
-            query = query .. k .. '=' .. v .. '&'
+            query = string.format("%s%s=%s&", query, k, v)
         end
         parsed.query = query .. "gsrsearch=" .. url.escape(text)
-    else -- full page content
+    elseif page_type == WIKIPEDIA_FULL then -- full page content
         self.wiki_params.explaintext = plain and "" or nil
         for k,v in pairs(self.wiki_params) do
-            query = query .. k .. '=' .. v .. '&'
+            query = string.format("%s%s=%s&", query, k, v)
        end
         parsed.query = query .. "titles=" .. url.escape(text)
+    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
+        for k,v in pairs(self.wiki_phtml_params) do
+            query = string.format("%s%s=%s&", query, k, v)
+        end
+        parsed.query = query .. "page=" .. url.escape(text)
+    else
+        return
+    end
 
     -- HTTP request
@@ -107,7 +139,7 @@ end
 
 -- search wikipedia and get intros for results
 function Wikipedia:wikintro(text, lang)
-    local result = self:loadPage(text, lang, true, true)
+    local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
     if result then
         local query = result.query
         if query then
@@ -118,14 +150,720 @@ end
 
 -- get full content of a wiki page
 function Wikipedia:wikifull(text, lang)
-    local result = self:loadPage(text, lang, false, true)
+    local result = self:loadPage(text, lang, WIKIPEDIA_FULL, true)
     if result then
         local query = result.query
         if query then
+            if self.wiki_prettify then
+                -- Prettification of the plain text full page
+                for pageid, page in pairs(query.pages) do
+                    if page.extract then
+                        page.extract = self:prettifyText(page.extract)
+                    end
+                end
+            end
             return query.pages
         end
     end
 end
 
+-- get parsed html content and other infos of a wiki page
+function Wikipedia:wikiphtml(text, lang)
+    local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true)
+    if result and result.parse then
+        return result.parse
+    end
+    -- (also guard result itself, as loadPage() may have returned nil)
+    if result and result.error and result.error.info then
+        error(result.error.info)
+    end
+end
+
+-- UTF8 of unicode geometrical shapes we can use to replace
+-- the "=== title ===" of wikipedia plaintext pages.
+-- These chosen ones are available in most fonts (prettier symbols
+-- exist in unicode, but are available in a few fonts only) and
+-- have a quite consistent size/weight in all fonts.
+local th1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
+local th2_sym = "\xE2\x96\x89" -- big black square
+local th3_sym = "\xC2\xA0\xE2\x97\x86" -- black diamond (indented, nicer)
+local th4_sym = "\xE2\x97\xA4" -- black upper left triangle
+local th5_sym = "\xE2\x9C\xBF" -- black florette
+local th6_sym = "\xE2\x9D\x96" -- black diamond minus white x
+-- Others available in most fonts :
+-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross
+-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle
+-- local thX_sym = "\xE2\x97\x89" -- fish eye
+-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right
+
+-- For optional prettification of the plain text full page
+function Wikipedia:prettifyText(text)
+    -- We use \a for an additional leading \n that we don't want shortened later
+    text = text:gsub("\n= ",    "\n\a"..th1_sym.." ") -- 2 empty lines before
+    text = text:gsub("\n== ",   "\n\a"..th2_sym.." ") -- 2 empty lines before
+    text = text:gsub("\n=== ",    "\n"..th3_sym.." ")
+    text = text:gsub("\n==== ",   "\n"..th4_sym.." ")
+    text = text:gsub("\n===== ",  "\n"..th5_sym.." ")
+    text = text:gsub("\n====== ", "\n"..th6_sym.." ")
+    text = text:gsub("Modifier ==", " ==") -- fr wikipedia fix for some articles modified by clumsy editors
+    text = text:gsub("==$", "==\n")      -- for a "== title ==" at end of text, to be matched by next gsub
+    text = text:gsub(" ===?\n+", "\n\n") -- "=" and "==" titles : empty line after
+    text = text:gsub(" ====+\n+", "\n")  -- "===" and deeper titles : single \n, no empty line
+    text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to prev text
+    text = text:gsub("\n +\n", "\n")  -- trim lines full of only spaces (often seen in math formulas)
+    text = text:gsub("^\n*", "")      -- trim new lines at start
+    text = text:gsub("\n*$", "")      -- trim new lines at end
+    text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines
+    text = text:gsub("\a", "\n")      -- re-add our wished \n
+    return text
+end
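+
+-- For illustration, a plain text extract like "\n== History ==\nSome text.\n"
+-- comes out of prettifyText() as "\n▉ History\n\nSome text." : the ==s are
+-- gone, the level-2 symbol is prepended, and an empty line is kept after
+-- the title.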
") + text = text:gsub("Modifier ==", " ==") -- fr wikipedia fix for some articles modified by clumsy editors + text = text:gsub("==$", "==\n") -- for a at end of text to be matched by next gsub + text = text:gsub(" ===?\n+", "\n\n") -- to : empty line after + text = text:gsub(" ====+\n+", "\n") -- to : single \n, no empty line + text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to prev text + text = text:gsub("\n +\n", "\n") -- trim lines full of only spaces (often seen in math formulas) + text = text:gsub("^\n*", "") -- trim new lines at start + text = text:gsub("\n*$", "") -- trim new lines at end + text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines + text = text:gsub("\a", "\n") -- re-add our wished \n + return text +end + + +local function getUrlContent(url, timeout) + local socket = require('socket') + local ltn12 = require('ltn12') + local requester + if url:sub(1,7) == "http://" then + requester = require('socket.http') + elseif url:sub(1,8) == "https://" then + requester = require('ssl.https') + else + return false, "Unsupported protocol" + end + requester.TIMEOUT = timeout or 10 + local request = {} + local sink = {} + request['url'] = url + request['method'] = 'GET' + request['sink'] = ltn12.sink.table(sink) + -- first argument returned by skip is code + local _, headers, status = socket.skip(1, requester.request(request)) + + if headers == nil then + logger.warn("No HTTP headers") + return false, "Network unavailable" + end + if status ~= "HTTP/1.1 200 OK" then + logger.warn("HTTP status not okay:", status) + return false, "Network unavailable" + end + + return true, table.concat(sink) +end + +-- UTF8 of unicode geometrical shapes we'll prepend to wikipedia section headers, +-- to help identifying hierarchy (othewise, the small font size differences helps). +-- Best if identical to the ones used above for prettifying full plain text page. +-- These chosen ones are available in most fonts (prettier symbols +-- exist in unicode, but are available in a few fonts only) and +-- have a quite consistent size/weight in all fonts. +local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?) +local h2_sym = "\xE2\x96\x89" -- big black square +local h3_sym = "\xE2\x97\x86" -- black diamond +local h4_sym = "\xE2\x97\xA4" -- black upper left triangle +local h5_sym = "\xE2\x9C\xBF" -- black florette +local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x +-- Other available ones in most fonts +-- local hXsym = "\xE2\x9C\x9A" -- heavy greek cross +-- local hXsym = "\xE2\x97\xA2" -- black lower right triangle +-- local hXsym = "\xE2\x97\x89" -- fish eye +-- local hXsym = "\xE2\x96\x97" -- quadrant lower right + +local ext_to_mimetype = { + png = "image/png", + jpg = "image/jpeg", + jpeg = "image/jpeg", + gif = "image/gif", + svg = "image/svg+xml", + html= "application/xhtml+xml", + xhtml= "application/xhtml+xml", + ncx = "application/x-dtbncx+xml", + js = "text/javascript", + css = "text/css", + otf = "application/opentype", + ttf = "application/truetype", + woff = "application/font-woff", +} + + +-- Create an epub file (with possibly images) +-- This is non-UI code (for batch creation or emulator test), but it accepts +-- a progress_callback function that will be feed with progress information +-- that could be shown to the user. 
+
+-- UTF8 of unicode geometrical shapes we'll prepend to wikipedia section headers,
+-- to help identifying hierarchy (otherwise, only the small font size differences help).
+-- Best if identical to the ones used above for prettifying the full plain text page.
+-- These chosen ones are available in most fonts (prettier symbols
+-- exist in unicode, but are available in a few fonts only) and
+-- have a quite consistent size/weight in all fonts.
+local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
+local h2_sym = "\xE2\x96\x89" -- big black square
+local h3_sym = "\xE2\x97\x86" -- black diamond
+local h4_sym = "\xE2\x97\xA4" -- black upper left triangle
+local h5_sym = "\xE2\x9C\xBF" -- black florette
+local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x
+-- Other ones available in most fonts :
+-- local hX_sym = "\xE2\x9C\x9A" -- heavy greek cross
+-- local hX_sym = "\xE2\x97\xA2" -- black lower right triangle
+-- local hX_sym = "\xE2\x97\x89" -- fish eye
+-- local hX_sym = "\xE2\x96\x97" -- quadrant lower right
+
+local ext_to_mimetype = {
+    png = "image/png",
+    jpg = "image/jpeg",
+    jpeg = "image/jpeg",
+    gif = "image/gif",
+    svg = "image/svg+xml",
+    html = "application/xhtml+xml",
+    xhtml = "application/xhtml+xml",
+    ncx = "application/x-dtbncx+xml",
+    js = "text/javascript",
+    css = "text/css",
+    otf = "application/opentype",
+    ttf = "application/truetype",
+    woff = "application/font-woff",
+}
+
+-- Create an epub file (with possibly images).
+-- This is non-UI code (for batch creation or emulator test), but it accepts
+-- a progress_callback function that will be fed with progress information
+-- that could be shown to the user.
+function Wikipedia:createEpub(epub_path, page, lang, with_images, progress_callback)
+    if not progress_callback then
+        -- Make our own logging-only progress_callback
+        progress_callback = function(text, confirm)
+            logger.info("progress", confirm and "confirm" or "info", text)
+            return true -- always select "OK" in ConfirmBox
+        end
+    end
+
+    progress_callback(_("Fetching Wikipedia page..."))
+    local ok, phtml = pcall(self.wikiphtml, self, page, lang)
+    if not ok then
+        progress_callback(phtml)
+        -- Sleep a bit to make that error seen
+        util.sleep(2)
+        progress_callback() -- close last progress info
+        return false
+    end
+
+    -- Get infos from the wikipedia result
+    -- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection)
+    local cancelled = false
+    local html = phtml.text["*"] -- html content
+    local page_cleaned = page:gsub("_", " ") -- page title
+    local page_htmltitle = phtml.displaytitle -- page title with possible tags
+    local sections = phtml.sections -- Wikipedia provided TOC
+    local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
+    -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
+    -- should it change if content is updated (as now, including the wikipedia revisionId),
+    -- or should it stay the same even if revid changes (content of the same book updated)?
+
+    -- We need to find images in the HTML, to tell the user how many there are
+    -- when asking if they should be included
+    local images = {}
+    local seen_images = {}
+    local imagenum = 1
+    local cover_imgid = "" -- best candidate for cover among our images
+    local processImg = function(img_tag)
+        local src = img_tag:match([[src="([^"]*)"]])
+        if src == nil or src == "" then
+            logger.info("no src found in ", img_tag)
+            return nil
+        end
+        if src:sub(1,2) == "//" then
+            src = "https:" .. src -- Wikipedia redirects from http to https, so use https
+        end
+        local cur_image
+        if seen_images[src] then -- already seen
+            cur_image = seen_images[src]
+        else
+            local ext = src:match(".*%.(%S+)")
+            if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it
+                logger.info("no file extension found in ", src)
+                return nil
+            end
+            ext = ext:lower()
+            local imgid = string.format("img%05d", imagenum)
+            local imgpath = string.format("images/%s.%s", imgid, ext)
+            local mimetype = ext_to_mimetype[ext] or ""
+            local width = img_tag:match([[width="([^"]*)"]])
+            local height = img_tag:match([[height="([^"]*)"]])
+            -- Get higher resolution (2x) image url
+            local src2x = nil
+            local srcset = img_tag:match([[srcset="([^"]*)"]])
+            if srcset then
+                srcset = " "..srcset.. ", " -- for next pattern to possibly match 1st or last item
+                src2x = srcset:match([[ (%S+) 2x, ]])
+                if src2x and src2x:sub(1,2) == "//" then
+                    src2x = "https:" .. src2x
+                end
+            end
+            cur_image = {
+                imgid = imgid,
+                imgpath = imgpath,
+                src = src,
+                src2x = src2x,
+                mimetype = mimetype,
+                width = width,
+                height = height,
+            }
+            table.insert(images, cur_image)
+            seen_images[src] = cur_image
+            -- Use first image of reasonable size (not an icon) and portrait-like as cover-image
+            -- (guard with tonumber(), as width/height attributes may be missing)
+            local w, h = tonumber(width), tonumber(height)
+            if cover_imgid == "" and w and h and w > 50 and h > 50 and h > w then
+                cover_imgid = imgid
+            end
+            imagenum = imagenum + 1
+        end
+        -- crengine will NOT use width and height attributes, but it will use
+        -- those found in a style attribute.
+        -- If we get src2x images, crengine will scale them down to the 1x image size
+        -- (less space wasted by images while reading), but the 2x quality will be
+        -- there when the image is viewed full screen with the ImageViewer widget.
+        return string.format([[<img src="%s" style="width: %spx; height: %spx" alt=""/>]], cur_image.imgpath, cur_image.width, cur_image.height)
+    end
+    html = html:gsub("(<%s*img [^>]*>)", processImg)
+    logger.dbg("Images found in html:", images)
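+
+    -- For illustration (attributes shortened), processImg() turns a tag like :
+    --   <img src="//upload.wikimedia.org/...thumb.jpg" width="220" height="147"
+    --        srcset="//upload.../1.5x.jpg 1.5x, //upload.../2x.jpg 2x"/>
+    -- into :
+    --   <img src="images/img00001.jpg" style="width: 220px; height: 147px" alt=""/>
+    -- while remembering src (and the 2x variant) for later download.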
+
+    -- See what to do with images
+    local include_images = false
+    local use_img_2x = false
+    if with_images then
+        -- if no progress_callback (non UI), our fake one will return true
+        if #images > 0 then
+            include_images = progress_callback(T(_("Page contains %1 images.\nWould you like to download and include them in the epub?"), #images), true)
+            if include_images then
+                use_img_2x = progress_callback(_("Would you like to get slightly higher quality images (but bigger file size)?"), true)
+            end
+        else
+            progress_callback(_("Page contains no image."))
+            util.sleep(1) -- Let the user see that
+        end
+    end
+    if not include_images then
+        -- Remove img tags to avoid little blank squares of missing images
+        html = html:gsub("<%s*img [^>]*>", "")
+        -- We could remove the whole image container <div class="thumb"...>,
+        -- but it's a lot of nested <div> and not easy to do.
+        -- So the user will see the image legends and know a bit about
+        -- the images they chose not to get.
+    end
+
+    -- Open the zip file (with .tmp for now, as crengine may still
+    -- have a handle to the final epub_path, and we don't want to
+    -- delete a good one if we fail/cancel later)
+    local epub_path_tmp = epub_path .. ".tmp"
+    local ZipWriter = require("ffi/zipwriter")
+    local epub = ZipWriter:new{}
+    if not epub:open(epub_path_tmp) then
+        return false
+    end
+
+    -- We now create and add all the required epub files
+
+    -- ----------------------------------------------------------------
+    -- /mimetype : always "application/epub+zip"
+    epub:add("mimetype", "application/epub+zip")
+
+    -- ----------------------------------------------------------------
+    -- /META-INF/container.xml : always the same content
+    epub:add("META-INF/container.xml", [[
+<?xml version="1.0"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+  <rootfiles>
+    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+  </rootfiles>
+</container>
+]])
+
+    -- ----------------------------------------------------------------
+    -- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory)
+    -- Other possible items in this file that are of no interest to crengine :
+    --   In <manifest> :
+    --     <item id="cover" href="cover.html" media-type="application/xhtml+xml"/>
+    -- (crengine only uses <meta name="cover" content="..."/> to get the cover image)
+    --   In <spine toc="ncx"> :
+    --     <itemref idref="cover" linear="no"/>
+    --   And a <guide> section :
+    --     <guide>
+    --       <reference href="cover.html" type="cover" title="Cover"/>
+    --     </guide>
+    local koreader_version = "KOReader"
+    if lfs.attributes("git-rev", "mode") == "file" then
+        koreader_version = "KOReader "..io.open("git-rev", "r"):read()
+    end
+    local content_opf_parts = {}
+    -- head
+    table.insert(content_opf_parts, string.format([[
+<?xml version='1.0' encoding='utf-8'?>
+<package xmlns="http://www.idpf.org/2007/opf"
+        xmlns:dc="http://purl.org/dc/elements/1.1/"
+        unique-identifier="bookid">
+  <metadata>
+    <dc:title>%s</dc:title>
+    <dc:publisher>Wikipedia %s</dc:publisher>
+    <dc:identifier id="bookid">%s</dc:identifier>
+    <dc:language>%s</dc:language>
+    <dc:contributor>%s</dc:contributor>
+    <meta name="cover" content="%s"/>
+  </metadata>
+  <manifest>
+    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
+    <item id="content" href="content.html" media-type="application/xhtml+xml"/>
+    <item id="css" href="stylesheet.css" media-type="text/css"/>
+]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
+    -- images files
+    if include_images then
+        for inum, img in ipairs(images) do
+            table.insert(content_opf_parts, string.format([[    <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
+        end
+    end
+    -- tail
+    table.insert(content_opf_parts, [[
+  </manifest>
+  <spine toc="ncx">
+    <itemref idref="content"/>
+  </spine>
+</package>
+]])
+    epub:add("OEBPS/content.opf", table.concat(content_opf_parts))
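+
+    -- At this point the epub skeleton is, for illustration :
+    --   mimetype
+    --   META-INF/container.xml
+    --   OEBPS/content.opf
+    -- with stylesheet.css, toc.ncx, content.html and images/* added below.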
+
+    -- ----------------------------------------------------------------
+    -- OEBPS/stylesheet.css
+    -- crengine will use its own data/epub.css, we just add/fix a few styles
+    -- to look more like wikipedia web pages (the user can ignore them by
+    -- turning "Embedded Style" off)
+    epub:add("OEBPS/stylesheet.css", [[
+/* make section headers look left aligned and avoid some page breaks */
+h1, h2 {
+    text-align: left;
+}
+h3, h4, h5, h6, h7 {
+    page-break-before: avoid;
+    page-break-after: avoid;
+    text-align: left;
+}
+/* avoid page breaks around our centered titles on first page */
+h1.koreaderwikifrontpage, h5.koreaderwikifrontpage {
+    page-break-before: avoid;
+    page-break-inside: avoid;
+    page-break-after: avoid;
+    text-align: center;
+    margin-top: 0em;
+}
+p.koreaderwikifrontpage {
+    font-style: italic;
+    font-size: 90%;
+    margin-left: 2em;
+    margin-right: 2em;
+    margin-top: 1em;
+    margin-bottom: 1em;
+}
+hr.koreaderwikifrontpage {
+    margin-left: 20%;
+    margin-right: 20%;
+    margin-bottom: 1.2em;
+}
+/* So many links, make them look like normal text except for underline */
+a {
+    display: inline;
+    text-decoration: underline;
+    color: black;
+    font-weight: normal;
+}
+/* No underline for links whose href we removed */
+a.newwikinonexistent {
+    text-decoration: none;
+}
+/* show a box around image thumbnails */
+div.thumb {
+    width: 80%;
+    border: dotted 1px black;
+    margin-top: 0.5em;
+    margin-bottom: 0.5em;
+    margin-left: 2.5em;
+    margin-right: 2.5em;
+    padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[;
+    padding-bottom: 0.2em;
+    padding-left: 0.5em;
+    padding-right: 0.5em;
+    text-align: center;
+    font-size: 90%;
+}
+/* don't waste left margin for notes and list of pages */
+ul, ol {
+    margin-left: 0em;
+}
+/* helps crengine to not display them as block elements */
+time, abbr, sup {
+    display: inline;
+}
+]])
+
+    -- ----------------------------------------------------------------
+    -- OEBPS/toc.ncx : table of content
+    local toc_ncx_parts = {}
+    local depth = 0
+    local cur_level = 0
+    local np_end = [[</navPoint>]]
+    local num = 1
+    -- Add our own first section for first page, with page name as title
+    table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_cleaned))
+    table.insert(toc_ncx_parts, np_end)
+    -- Wikipedia section items seem to be already sorted by index, so no need to sort
+    for isec, s in ipairs(sections) do
+        num = num + 1
+        local s_anchor = s.anchor
+        local s_title = string.format("%s %s", s.number, s.line)
+        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
+        local s_level = s.toclevel
+        if s_level > depth then
+            depth = s_level -- max depth required in toc.ncx
+        end
+        if s_level == cur_level then
+            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
+        elseif s_level < cur_level then
+            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
+            while s_level < cur_level do -- close all in-between navPoint
+                table.insert(toc_ncx_parts, np_end)
+                cur_level = cur_level - 1
+            end
+        elseif s_level > cur_level + 1 then
+            -- a jump from level N to level N+2 or more ... should not happen
+            -- per epub spec, but we don't know about wikipedia...
+            -- so we create missing intermediate navPoints with same anchor as current section
+            while s_level > cur_level + 1 do
+                table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level))
+                table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>-</text></navLabel><content src="content.html#%s"/>]], num, num, s_anchor))
+                cur_level = cur_level + 1
+                num = num + 1
+            end
+        -- elseif s_level == cur_level + 1 then
+        --     sublevel, nothing to close, nothing to add
+        end
+        cur_level = s_level
+        table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) -- indentation, in case a person looks at it
+        table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html#%s"/>]], num, num, s_title, s_anchor))
+    end
+    -- close nested <navPoint>
+    while cur_level > 0 do
+        table.insert(toc_ncx_parts, np_end)
+        cur_level = cur_level - 1
+    end
+    -- Prepend NCX head
+    table.insert(toc_ncx_parts, 1, string.format([[
+<?xml version='1.0' encoding='utf-8'?>
+<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+  <head>
+    <meta name="dtb:uid" content="%s"/>
+    <meta name="dtb:depth" content="%s"/>
+    <meta name="dtb:totalPageCount" content="0"/>
+    <meta name="dtb:maxPageNumber" content="0"/>
+  </head>
+  <docTitle>
+    <text>%s</text>
+  </docTitle>
+  <navMap>
+]], bookid, depth, page_cleaned))
+    -- Append NCX tail
+    table.insert(toc_ncx_parts, [[
+  </navMap>
+</ncx>
+]])
+    epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))
+
+    -- ----------------------------------------------------------------
+    -- OEBPS/content.html
+    -- Some small fixes to Wikipedia HTML to make crengine and the user happier
+
+    -- Most images are in a link to the image info page, which is a useless
+    -- external link for us, so let's remove this link.
+    html = html:gsub("<a[^>]*>%s*(<%s*img [^>]*>)%s*</a>", "%1")
+
+    -- For some <div class="thumb tright">, which include nested divs, although
+    -- perfectly balanced, crengine seems to miss some closing </div> and we
+    -- end up having our image bordered box including the remaining main wiki text.
+    -- It looks like this code is supposed to deal with class= containing multiple
+    -- class names :
+    --   https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
+    -- and that it may stumble on some cases.
+    -- It's all perfectly fine if we make all these divs with a single class name :
+    --   html = html:gsub([[<div class="thumb tright">]], [[<div class="thumbtright">]])
+    --
+    -- But we may as well make all class= have a single name to avoid other problems
+    -- (no real risk with that, as we don't define any style for wikipedia class names,
+    -- except div.thumb that always appears first).
+    html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])
+
+    -- crengine seems to consider unknown tags as 'block' elements, so we may
+    -- want to remove or replace those that should be considered 'inline' elements
+    html = html:gsub("</?time[^>]*>", "")
+
+    -- Fix internal wikipedia links with full server url (including lang) so
+    -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
+    local wiki_base_url = self:getWikiServer(lang)
+    --   html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
+    --
+    -- Also, crengine deals strangely with percent encoded utf8 :
+    -- if the link in the html is : <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">
+    -- we get from credocument:getLinkFromPosition() : http://fr.wikipedia.org/wiki/FranÃ§oix
+    -- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8,
+    -- when we should have got "\xc3\xa7" ...
+    -- We can avoid that by putting plain unencoded UTF8 in the url
+    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
+    local fixEncodedWikiPageTitle = function(wiki_page)
+        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
+        return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
+    end
+    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
+
+    -- Remove href from links to non existent wiki pages so they are not clickable :
+    --   <a href="/w/index.php?title=PageTitle&amp;action=edit&amp;redlink=1" class="new" ...>PageTitle</a>
+    -- (removal of the href="" will make them non clickable)
+    html = html:gsub([[<a[^>]* class="new"[^>]*>]], [[<a class="newwikinonexistent">]])
+
+    -- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php..)
+    html = html:gsub([[href="//]], [[href="https://]])
+
+    -- crengine does not return link if multiple class names in <a>
+    -- (e.g. <a rel="nofollow" class="external text" href="...">)
+    -- it would be no problem as we can't follow them, but when the user taps
+    -- on it, the tap is propagated to other widgets and a page change happens...
+    --   html = html:gsub([[<a rel="nofollow" class="external text"]], [[<a rel="nofollow" class="externaltext"]])
+    --   (no more needed, as we made all class= have a single name above)
+    -- A link is also not returned before <a> (if it starts a line) or after </a> (if it
+    -- ends a line or a block); this could be worked around by wrapping it with U+200B
+    -- ZERO WIDTH SPACE, which will make the DOM tree walking code that finds a link stop at it :
+    --   html = html:gsub("(<[aA])", "\xE2\x80\x8B%1")
+    --   html = html:gsub("(</[aA]>)", "%1\xE2\x80\x8B")
+    -- Fixed in crengine lvtinydom.
+
+    if self.wiki_prettify then
+        -- Prepend some symbols to section titles for a better visual feeling of hierarchy
+        html = html:gsub("<h1>", "<h1> "..h1_sym.." ")
+        html = html:gsub("<h2>", "<h2> "..h2_sym.." ")
+        html = html:gsub("<h3>", "<h3> "..h3_sym.." ")
+        html = html:gsub("<h4>", "<h4> "..h4_sym.." ")
+        html = html:gsub("<h5>", "<h5> "..h5_sym.." ")
+        html = html:gsub("<h6>", "<h6> "..h6_sym.." ")
+    end
+
+    -- Note: in all the gsub patterns above, we used lowercase for tags and attributes
+    -- because that's how they are in wikipedia HTML and it makes the patterns simpler.
+    -- If one day this changes, they'll have to be replaced with href => [Hh][Rr][Ee][Ff] ...
+
+    -- We can finally build the final HTML with some header of our own
+    local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S"))
+    local online_version_htmllink = string.format([[<a href="%s/wiki/%s">%s</a>]], wiki_base_url, page:gsub(" ", "_"), _("online version"))
+    local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink)
+    epub:add("OEBPS/content.html", string.format([[
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>%s</title>
+<link rel="stylesheet" type="text/css" href="stylesheet.css"/>
+</head>
+<body>
+<h1 class="koreaderwikifrontpage">%s</h1>
+<h5 class="koreaderwikifrontpage">Wikipedia %s</h5>
+<p class="koreaderwikifrontpage">%s<br/>
+%s</p>
+<hr class="koreaderwikifrontpage"/>
+%s
+</body>
+</html>
+]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html))
+
+    -- ----------------------------------------------------------------
+    -- OEBPS/images/*
+    if include_images then
+        local nb_images = #images
+        for inum, img in ipairs(images) do
+            progress_callback(T(_("Fetching image %1 / %2 ..."), inum, nb_images))
+            local src = img.src
+            if use_img_2x and img.src2x then
+                src = img.src2x
+            end
+            logger.dbg("Getting img ", src)
+            local success, content = getUrlContent(src)
+            -- success, content = getUrlContent(src..".unexistant") -- to simulate failure
+            if success then
+                logger.dbg("success, size:", #content)
+            else
+                logger.info("failed fetching:", src)
+            end
+            if success then
+                epub:add("OEBPS/"..img.imgpath, content)
+            else
+                local go_on = progress_callback(T(_("Failed getting image %1, continue anyway?"), inum), true)
+                if not go_on then
+                    cancelled = true
+                    break
+                end
+            end
+        end
+    end
+
+    -- Done with adding files
+    if cancelled then
+        progress_callback(_("Cleaning up..."))
+    else
+        progress_callback(_("Packing epub..."))
+    end
+    epub:close()
+    -- This was nearly a no-op, so sleep a bit to make that progress step seen
+    util.usleep(300000)
+    progress_callback() -- close last progress info
+
+    if cancelled then
+        -- Build was cancelled, remove the half-created .epub
+        if lfs.attributes(epub_path_tmp, "mode") == "file" then
+            os.remove(epub_path_tmp)
+        end
+        return false
+    end
+
+    -- Finally move the .tmp to the final file
+    os.rename(epub_path_tmp, epub_path)
+    logger.info("successfully created:", epub_path)
+    return true
+end
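+
+-- For illustration, a minimal custom progress_callback honoring the contract
+-- used by createEpub() above (text to show, and whether a yes/no answer is
+-- wanted; the returned boolean is the answer) -- paths and page name are
+-- illustrative :
+--   local quiet_callback = function(text, confirm)
+--       if text then print(confirm and "ask: "..text or "info: "..text) end
+--       return true -- answer "OK" to every question
+--   end
+--   Wikipedia:createEpub("/tmp/E-book.EN.epub", "E-book", "en", true, quiet_callback)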
+
+-- Wrapper to Wikipedia:createEpub() with UI progress info
+function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback)
+    -- For progress_callback to be able to wait when needed
+    -- for user confirmation, we need to wrap Wikipedia:createEpub
+    -- in a coroutine that can be resumed by these confirm callbacks.
+    local UIManager = require("ui/uimanager")
+    local InfoMessage = require("ui/widget/infomessage")
+    local ConfirmBox = require("ui/widget/confirmbox")
+
+    -- Visual progress callback
+    local cur_progress_box = nil
+    local function ui_progress_callback(text, confirmbox)
+        if cur_progress_box then
+            -- close previous progress info
+            UIManager:close(cur_progress_box)
+            -- no repaint here, we'll do that below when new stuff is shown
+        end
+        if not text then
+            -- no text given, used to just close previous progress info when done
+            -- a repaint is needed
+            UIManager:forceRePaint()
+            return true
+        end
+        if confirmbox then
+            -- ConfirmBox requested: callbacks will resume coroutine
+            local _coroutine = coroutine.running()
+            cur_progress_box = ConfirmBox:new{
+                text = text,
+                ok_callback = function()
+                    coroutine.resume(_coroutine, true)
+                end,
+                cancel_callback = function()
+                    coroutine.resume(_coroutine, false)
+                end,
+            }
+        else
+            -- simple InfoMessage requested
+            cur_progress_box = InfoMessage:new{text = text}
+        end
+        logger.dbg("Showing", confirmbox and "ConfirmBox" or "InfoMessage", text)
+        UIManager:show(cur_progress_box)
+        UIManager:forceRePaint()
+        if not confirmbox then
+            return true -- nothing more to do
+        end
+        -- we need to wait for the ConfirmBox callback
+        logger.dbg("waiting for coroutine to resume")
+        if coroutine.running() then
+            local result = coroutine.yield()
+            logger.dbg(" coroutine ran and returned", result)
+            return result
+        end
+    end
+
+    -- Coroutine wrapping Wikipedia:createEpub()
+    local co = coroutine.create(function()
+        -- If Wikipedia:createEpub() errors out, the coroutine
+        -- would just abort without crashing the reader, so
+        -- pcall would not be needed. But if that happens,
+        -- pcall will let us know and return the error,
+        -- which we can log.
+        local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true, ui_progress_callback)
+        if ok and success then
+            result_callback(true)
+        else
+            ui_progress_callback() -- close any last progress info not cleaned
+            logger.warn("Wikipedia.createEpub pcall:", ok, success)
+            result_callback(false)
+        end
+    end)
+    -- Execute coroutine
+    coroutine.resume(co)
+end
 
 return Wikipedia
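From UI code, the coroutine wrapper is the entry point; a minimal sketch of a
caller, mirroring what dictquicklookup.lua above does (path and page name are
illustrative):

    local Wikipedia = require("ui/wikipedia")
    Wikipedia:createEpubWithUI("/tmp/E-book.EN.epub", "E-book", "en", function(success)
        -- runs once the epub build has finished, failed or been cancelled
        print(success and "saved" or "not saved")
    end)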