Wikipedia: save full page as epub (html, with optional images)

Also for the current text-only wikipedia full page: replace
the ==s in section titles with some unicode symbols for a better
visual feeling of hierarchy. These same symbols are also used
in the .epub.
Both can be disabled by adding ["wikipedia_prettify"] = false
to settings.reader.lua
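
For example, a minimal sketch of the opt-out, assuming settings.reader.lua keeps its usual shape of a returned Lua table of saved settings:

    -- settings.reader.lua (hypothetical excerpt; other saved settings omitted)
    return {
        ["wikipedia_prettify"] = false, -- disable the symbols in plain text and epub
    }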

readerhighlight: close ButtonTable (like other buttons do) when looking
up wikipedia; otherwise, if we save as epub and switch to the new document,
this ButtonTable would never be closed and would stay in the UI.
pull/2516/head
poire-z 7 years ago committed by Qingping Hou
parent 81bc115cee
commit 2ded2bce5e

@@ -1 +1 @@
Subproject commit 243533e95ffb61b841b70aa4d50fd80df9cbfcaa
Subproject commit e8a24fe9b99b4c4ceb6c9329648e7a2f4d10bc0c

@@ -357,6 +357,7 @@ function ReaderHighlight:onHoldRelease()
callback = function()
UIManager:scheduleIn(0.1, function()
self:lookupWikipedia()
self:onClose()
end)
end,
},

@@ -101,7 +101,7 @@ function ReaderWikipedia:initLanguages(word)
end
end
function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang)
if not NetworkMgr:isOnline() then
NetworkMgr:promptWifiOn()
return
@@ -109,8 +109,14 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
-- word is the text to query. If get_fullpage is true, it is the
-- exact wikipedia page title we want the full page of.
self:initLanguages(word)
-- use first lang from self.wiki_languages, which may have been rotated by DictQuickLookup
local lang = self.wiki_languages[1]
local lang
if forced_lang then
-- use provided lang (from readerlink when noticing that an external link is a wikipedia url)
lang = forced_lang
else
-- use first lang from self.wiki_languages, which may have been rotated by DictQuickLookup
lang = self.wiki_languages[1]
end
logger.dbg("lookup word:", word, box, get_fullpage)
-- no need to clean word if get_fullpage, as it is the exact wikipedia page title
if word and not get_fullpage then
@@ -166,6 +172,7 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
word = page.title,
definition = definition,
is_fullpage = get_fullpage,
lang = lang,
}
table.insert(results, result)
end
@@ -179,6 +186,7 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage)
word = word,
definition = self.no_page,
is_fullpage = get_fullpage,
lang = lang,
}
}
logger.dbg("dummy result table:", word, results)

@@ -19,6 +19,7 @@ local Device = require("device")
local Geom = require("ui/geometry")
local Event = require("ui/event")
local Font = require("ui/font")
local util = require("util")
local logger = require("logger")
local _ = require("gettext")
local T = require("ffi/util").template
@@ -245,10 +246,67 @@ function DictQuickLookup:update()
-- Different sets of buttons if fullpage or not
local buttons
if self.is_fullpage then
-- Only a single wide close button, get a little more room for
-- closing by tapping at bottom (on footer or on this button)
-- A save and a close button
buttons = {
{
{
text = "Save as epub",
callback = function()
local InfoMessage = require("ui/widget/infomessage")
local ConfirmBox = require("ui/widget/confirmbox")
-- if forced_lang was specified, it may not be in our wiki_languages,
-- but ReaderWikipedia will have put it in result.lang
local lang = self.lang or self.wiki_languages_copy[1]
-- Just to be safe (none of the invalid chars, except ':' for uninteresting
-- Portal: or File: wikipedia pages, should be in lookup_word)
local cleaned_lookupword = util.replaceInvalidChars(self.lookupword)
local filename = cleaned_lookupword .. "."..string.upper(lang)..".epub"
-- Find a directory to save file into
local dir = G_reader_settings:readSetting("wikipedia_save_dir")
if not dir then dir = G_reader_settings:readSetting("download_dir") end -- OPDS dir
if not dir then dir = G_reader_settings:readSetting("home_dir") end
if not dir then dir = G_reader_settings:readSetting("lastdir") end
if not dir then
UIManager:show(InfoMessage:new{
text = _("No directory to save page to !"),
})
return
end
local epub_path = dir .. "/" .. filename
UIManager:show(ConfirmBox:new{
text = T(_("Save as %1 ?"), filename),
ok_callback = function()
UIManager:scheduleIn(0.1, function()
local Wikipedia = require("ui/wikipedia")
Wikipedia:createEpubWithUI(epub_path, self.lookupword, lang, function(success)
if success then
UIManager:show(ConfirmBox:new{
text = T(_("Page saved to:\n%1\n\nWould you like to read the downloaded page now?"), epub_path),
ok_callback = function()
-- close all dict/wiki windows, without scheduleIn(highlight.clear())
self:onHoldClose(true)
-- close current ReaderUI in 1 sec, and create a new one
UIManager:scheduleIn(1.0, function()
local ReaderUI = require("apps/reader/readerui")
local reader = ReaderUI:_getRunningInstance()
if reader then
reader:onClose()
end
ReaderUI:showReader(epub_path)
end)
end,
})
else
UIManager:show(InfoMessage:new{
text = _("Failed saving Wikipedia page."),
})
end
end)
end)
end
})
end,
},
{
text = "Close",
callback = function()
@@ -457,6 +515,7 @@ function DictQuickLookup:changeDictionary(index)
self.lookupword = self.results[index].word
self.definition = self.results[index].definition
self.is_fullpage = self.results[index].is_fullpage
self.lang = self.results[index].lang
if self.is_fullpage then
self.displayword = self.lookupword
else
@@ -546,12 +605,12 @@ function DictQuickLookup:onClose()
return true
end
function DictQuickLookup:onHoldClose()
function DictQuickLookup:onHoldClose(no_clear)
self:onClose()
for i = #self.window_list, 1, -1 do
local window = self.window_list[i]
-- if one holds a highlight, let's clear it like in onClose()
if window.highlight then
if window.highlight and not no_clear then
UIManager:scheduleIn(1, function()
window.highlight:clear()
end)

@@ -1,10 +1,18 @@
local JSON = require("json")
local logger = require("logger")
local util = require("ffi/util")
local _ = require("gettext")
local T = require("ffi/util").template
--[[
-- Query wikipedia using Wikimedia Web API.
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
--
-- To get parsed HTML:
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
--]]
local Wikipedia = {
@@ -37,16 +45,33 @@ local Wikipedia = {
-- (otherwise, we get the full text for only the first result, and
-- no text at all for the others)
},
wiki_phtml_params = {
action = "parse",
format = "json",
-- we only need the following information
prop = "text|sections|displaytitle|revid",
-- page = nil, -- text to lookup, will be added below
-- disabletoc = "", -- if we want to remove toc IN html
disablelimitreport = "",
disableeditsection = "",
},
-- allow for disabling prettifying full page text
wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),
}
function Wikipedia:getWikiServer(lang)
return string.format(self.wiki_server, lang or self.default_lang)
end
-- Possible values for page_type parameter to loadPage()
local WIKIPEDIA_INTRO = 1
local WIKIPEDIA_FULL = 2
local WIKIPEDIA_PHTML = 3
--[[
-- return decoded JSON table from Wikipedia
--]]
function Wikipedia:loadPage(text, lang, intro, plain)
function Wikipedia:loadPage(text, lang, page_type, plain)
local socket = require('socket')
local url = require('socket.url')
local http = require('socket.http')
@@ -58,18 +83,25 @@ function Wikipedia:loadPage(text, lang, intro, plain)
local parsed = url.parse(self:getWikiServer(lang))
parsed.path = self.wiki_path
if intro == true then -- search query
if page_type == WIKIPEDIA_INTRO then -- search query
self.wiki_search_params.explaintext = plain and "" or nil
for k,v in pairs(self.wiki_search_params) do
query = query .. k .. '=' .. v .. '&'
query = string.format("%s%s=%s&", query, k, v)
end
parsed.query = query .. "gsrsearch=" .. url.escape(text)
else -- full page content
elseif page_type == WIKIPEDIA_FULL then -- full page content
self.wiki_params.explaintext = plain and "" or nil
for k,v in pairs(self.wiki_params) do
query = query .. k .. '=' .. v .. '&'
query = string.format("%s%s=%s&", query, k, v)
end
parsed.query = query .. "titles=" .. url.escape(text)
elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
for k,v in pairs(self.wiki_phtml_params) do
query = string.format("%s%s=%s&", query, k, v)
end
parsed.query = query .. "page=" .. url.escape(text)
else
return
end
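-- Illustrative (hypothetical title, arbitrary pairs() order): with
-- page_type == WIKIPEDIA_PHTML, the loop above builds a request equivalent to:
--   https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection=&page=E-book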
-- HTTP request
@@ -107,7 +139,7 @@ end
-- search wikipedia and get intros for results
function Wikipedia:wikintro(text, lang)
local result = self:loadPage(text, lang, true, true)
local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
if result then
local query = result.query
if query then
@@ -118,14 +150,720 @@ end
-- get full content of a wiki page
function Wikipedia:wikifull(text, lang)
local result = self:loadPage(text, lang, false, true)
local result = self:loadPage(text, lang, WIKIPEDIA_FULL, true)
if result then
local query = result.query
if query then
if self.wiki_prettify then
-- Prettification of the plain text full page
for pageid, page in pairs(query.pages) do
if page.extract then
page.extract = self:prettifyText(page.extract)
end
end
end
return query.pages
end
end
end
-- get parsed html content and other infos of a wiki page
function Wikipedia:wikiphtml(text, lang)
local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true)
if result and result.parse then
return result.parse
end
if result and result.error and result.error.info then
error(result.error.info)
end
end
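-- Illustrative call (hypothetical title), mirroring how createEpub() below
-- consumes the result:
--   local ok, phtml = pcall(Wikipedia.wikiphtml, Wikipedia, "E-book", "en")
--   -- on success: phtml.text["*"] is the page HTML, phtml.sections the TOC,
--   -- and phtml.displaytitle / phtml.pageid / phtml.revid the page metadata
--   -- on failure: phtml carries the message raised by error() above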
-- UTF8 of unicode geometrical shapes we can use to replace
-- the "=== title ===" of wikipedia plaintext pages.
-- These chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in a few fonts only) and
-- have a quite consistent size/weight in all fonts.
local th1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never encountered; only for the web page title?)
local th2_sym = "\xE2\x96\x89" -- big black square
local th3_sym = "\xC2\xA0\xE2\x97\x86" -- black diamond (indented, nicer)
local th4_sym = "\xE2\x97\xA4" -- black upper left triangle
local th5_sym = "\xE2\x9C\xBF" -- black florette
local th6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Others available in most fonts
-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross
-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle
-- local thX_sym = "\xE2\x97\x89" -- fish eye
-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right
-- For optional prettification of the plain text full page
function Wikipedia:prettifyText(text)
-- We use \a for an additional leading \n that we don't want shortened later
text = text:gsub("\n= ", "\n\a"..th1_sym.." ") -- 2 empty lines before
text = text:gsub("\n== ", "\n\a"..th2_sym.." ") -- 2 empty lines before
text = text:gsub("\n=== ", "\n"..th3_sym.." ")
text = text:gsub("\n==== ", "\n"..th4_sym.." ")
text = text:gsub("\n===== ", "\n"..th5_sym.." ")
text = text:gsub("\n====== ", "\n"..th6_sym.." ")
text = text:gsub("Modifier ==", " ==") -- fr wikipedia fix for some articles modified by clumsy editors
text = text:gsub("==$", "==\n") -- for a </hN> at end of text to be matched by next gsub
text = text:gsub(" ===?\n+", "\n\n") -- </h2> to </h3> : empty line after
text = text:gsub(" ====+\n+", "\n") -- </h4> to </hN> : single \n, no empty line
text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to prev text
text = text:gsub("\n +\n", "\n") -- trim lines full of only spaces (often seen in math formulas)
text = text:gsub("^\n*", "") -- trim new lines at start
text = text:gsub("\n*$", "") -- trim new lines at end
text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines
text = text:gsub("\a", "\n") -- re-add our wished \n
return text
end
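-- Worked example (hypothetical extract): prettification turns
--   "\n== History ==\nText.\n=== Early life ===\nMore."
-- into (roughly)
--   "\n▉ History\n\nText.\n ◆ Early life\n\nMore."
-- (symbol prefixes added, "==" fences dropped, blank lines normalized; the \a
-- trick is what lets the leading newline survive the start-of-text trim).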
local function getUrlContent(url, timeout)
local socket = require('socket')
local ltn12 = require('ltn12')
local requester
if url:sub(1,7) == "http://" then
requester = require('socket.http')
elseif url:sub(1,8) == "https://" then
requester = require('ssl.https')
else
return false, "Unsupported protocol"
end
requester.TIMEOUT = timeout or 10
local request = {}
local sink = {}
request['url'] = url
request['method'] = 'GET'
request['sink'] = ltn12.sink.table(sink)
-- first argument returned by skip is code
local _, headers, status = socket.skip(1, requester.request(request))
if headers == nil then
logger.warn("No HTTP headers")
return false, "Network unavailable"
end
if status ~= "HTTP/1.1 200 OK" then
logger.warn("HTTP status not okay:", status)
return false, "Network unavailable"
end
return true, table.concat(sink)
end
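-- Example use within this module (illustrative URL): returns either
-- (true, body) or (false, "reason"):
--   local success, content = getUrlContent("https://upload.example.org/Foo.jpg", 10)
-- createEpub() below does exactly this for each image and feeds the body
-- to epub:add().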
-- UTF8 of unicode geometrical shapes we'll prepend to wikipedia section headers,
-- to help identify hierarchy (otherwise, only the small font-size differences help).
-- Best if identical to the ones used above for prettifying the full plain-text page.
-- These chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in a few fonts only) and
-- have a quite consistent size/weight in all fonts.
local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never encountered; only for the web page title?)
local h2_sym = "\xE2\x96\x89" -- big black square
local h3_sym = "\xE2\x97\x86" -- black diamond
local h4_sym = "\xE2\x97\xA4" -- black upper left triangle
local h5_sym = "\xE2\x9C\xBF" -- black florette
local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Other available ones in most fonts
-- local hXsym = "\xE2\x9C\x9A" -- heavy greek cross
-- local hXsym = "\xE2\x97\xA2" -- black lower right triangle
-- local hXsym = "\xE2\x97\x89" -- fish eye
-- local hXsym = "\xE2\x96\x97" -- quadrant lower right
local ext_to_mimetype = {
png = "image/png",
jpg = "image/jpeg",
jpeg = "image/jpeg",
gif = "image/gif",
svg = "image/svg+xml",
html = "application/xhtml+xml",
xhtml = "application/xhtml+xml",
ncx = "application/x-dtbncx+xml",
js = "text/javascript",
css = "text/css",
otf = "application/opentype",
ttf = "application/truetype",
woff = "application/font-woff",
}
-- Create an epub file (with possibly images)
-- This is non-UI code (for batch creation or emulator test), but it accepts
-- a progress_callback function that will be fed with progress information
-- that can be shown to the user.
function Wikipedia:createEpub(epub_path, page, lang, with_images, progress_callback)
if not progress_callback then
-- Make our own logging-only progress_callback
progress_callback = function(text, confirm)
logger.info("progress", confirm and "confirm" or "info", text)
return true -- always select "OK" in ConfirmBox
end
end
progress_callback(_("Fetching Wikipedia page..."))
local ok, phtml = pcall(self.wikiphtml, self, page, lang)
if not ok then
progress_callback(phtml)
-- Sleep a bit to make that error seen
util.sleep(2)
progress_callback() -- close last progress info
return false
end
-- Get infos from wikipedia result
-- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection)
local cancelled = false
local html = phtml.text["*"] -- html content
local page_cleaned = page:gsub("_", " ") -- page title
local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
local sections = phtml.sections -- Wikipedia provided TOC
local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it change if the content is updated (as now, where it includes the wikipedia revisionId),
-- or should it stay the same even if revid changes (same book, updated content)?
-- We need to find images in the HTML to tell how many there are when asking the user if they should be included
local images = {}
local seen_images = {}
local imagenum = 1
local cover_imgid = "" -- best candidate for cover among our images
local processImg = function(img_tag)
local src = img_tag:match([[src="([^"]*)"]])
if src == nil or src == "" then
logger.info("no src found in ", img_tag)
return nil
end
if src:sub(1,2) == "//" then
src = "https:" .. src -- Wikipedia redirects from http to https, so use https
end
local cur_image
if seen_images[src] then -- already seen
cur_image = seen_images[src]
else
local ext = src:match(".*%.(%S+)")
if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it
logger.info("no file extension found in ", src)
return nil
end
ext = ext:lower()
local imgid = string.format("img%05d", imagenum)
local imgpath = string.format("images/%s.%s", imgid, ext)
local mimetype = ext_to_mimetype[ext] or ""
local width = img_tag:match([[width="([^"]*)"]])
local height = img_tag:match([[height="([^"]*)"]])
-- Get higher resolution (2x) image url
local src2x = nil
local srcset = img_tag:match([[srcset="([^"]*)"]])
if srcset then
srcset = " "..srcset.. ", " -- for next pattern to possibly match 1st or last item
src2x = srcset:match([[ (%S+) 2x, ]])
if src2x and src2x:sub(1,2) == "//" then
src2x = "https:" .. src2x
end
end
cur_image = {
imgid = imgid,
imgpath = imgpath,
src = src,
src2x = src2x,
mimetype = mimetype,
width = width,
height = height,
}
table.insert(images, cur_image)
seen_images[src] = cur_image
-- Use first image of reasonable size (not an icon) and portrait-like as cover-image
if cover_imgid == "" and tonumber(width) and tonumber(height) and tonumber(width) > 50 and tonumber(height) > 50 and tonumber(height) > tonumber(width) then
cover_imgid = imgid
end
imagenum = imagenum + 1
end
-- crengine will NOT use width and height attributes, but it will use
-- those found in a style attribute.
-- If we get src2x images, crengine will scale them down to the 1x image size
-- (less space wasted by images while reading), but the 2x quality will be
-- there when image is viewed full screen with ImageViewer widget.
return string.format([[<img src="%s" style="width: %spx; height: %spx" alt=""/>]], cur_image.imgpath, cur_image.width, cur_image.height)
end
html = html:gsub("(<%s*img [^>]*>)", processImg)
logger.dbg("Images found in html:", images)
-- See what to do with images
local include_images = false
local use_img_2x = false
if with_images then
-- if no progress_callback (non UI), our fake one will return true
if #images > 0 then
include_images = progress_callback(T(_("Page contains %1 images.\nWould you like to download and include them in the EPUB?"), #images), true)
if include_images then
use_img_2x = progress_callback(_("Would you like to get slightly higher quality images (with a bigger file size)?"), true)
end
else
progress_callback(_("Page contains no image."))
util.sleep(1) -- Let the user see that
end
end
if not include_images then
-- Remove img tags to avoid little blank squares of missing images
html = html:gsub("<%s*img [^>]*>", "")
-- We could remove the whole image container <div class="thumb"...> ,
-- but it's a lot of nested <div> and not easy to do.
-- So the user will see the image legends and know a bit about
-- the images they chose not to download.
end
-- Open the zip file (with .tmp for now, as crengine may still
-- have a handle to the final epub_path, and we don't want to
-- delete a good one if we fail/cancel later)
local epub_path_tmp = epub_path .. ".tmp"
local ZipWriter = require("ffi/zipwriter")
local epub = ZipWriter:new{}
if not epub:open(epub_path_tmp) then
return false
end
-- We now create and add all the required epub files
-- ----------------------------------------------------------------
-- /mimetype : always "application/epub+zip"
epub:add("mimetype", "application/epub+zip")
-- ----------------------------------------------------------------
-- /META-INF/container.xml : always the same content
epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>]])
-- ----------------------------------------------------------------
-- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory)
-- Other possible items in this file that are of no interest to crengine:
-- In <manifest> :
-- <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
-- <item id="cover-image" href="images/cover.png" media-type="image/png"/>
-- (crengine only uses <meta name="cover" content="cover-image" /> to get the cover image)
-- In <spine toc="ncx"> :
-- <itemref idref="cover" linear="no"/>
-- And a <guide> section :
-- <guide>
-- <reference href="title.html" type="cover" title="Cover"/>
-- <reference href="toc.html" type="toc" title="Table of Contents" href="toc.html" />
-- </guide>
local koreader_version = "KOReader"
if lfs.attributes("git-rev", "mode") == "file" then
koreader_version = "KOReader "..io.open("git-rev", "r"):read()
end
local content_opf_parts = {}
-- head
table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
xmlns:dc="http://purl.org/dc/elements/1.1/"
unique-identifier="bookid" version="2.0">
<metadata>
<dc:title>%s</dc:title>
<dc:creator>Wikipedia %s</dc:creator>
<dc:identifier id="bookid">%s</dc:identifier>
<dc:language>%s</dc:language>
<dc:publisher>%s</dc:publisher>
<meta name="cover" content="%s"/>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="content" href="content.html" media-type="application/xhtml+xml"/>
<item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
-- images files
if include_images then
for inum, img in ipairs(images) do
table.insert(content_opf_parts, string.format([[ <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
end
end
-- tail
table.insert(content_opf_parts, [[
</manifest>
<spine toc="ncx">
<itemref idref="content"/>
</spine>
</package>
]])
epub:add("OEBPS/content.opf", table.concat(content_opf_parts))
-- ----------------------------------------------------------------
-- OEBPS/stylesheet.css
-- crengine will use its own data/epub.css; we just add/fix a few styles
-- to look more like wikipedia web pages (which the user can ignore
-- by turning "Embedded Style" off)
epub:add("OEBPS/stylesheet.css", [[
/* make section headers look left-aligned and avoid some page breaks */
h1, h2 {
text-align: left;
}
h3, h4, h5, h6, h7 {
page-break-before: avoid;
page-break-after: avoid;
text-align: left;
}
/* avoid page breaks around our centered titles on first page */
h1.koreaderwikifrontpage, h5.koreaderwikifrontpage {
page-break-before: avoid;
page-break-inside: avoid;
page-break-after: avoid;
text-align: center;
margin-top: 0em;
}
p.koreaderwikifrontpage {
font-style: italic;
font-size: 90%;
margin-left: 2em;
margin-right: 2em;
margin-top: 1em;
margin-bottom: 1em;
}
hr.koreaderwikifrontpage {
margin-left: 20%;
margin-right: 20%;
margin-bottom: 1.2em;
}
/* So many links, make them look like normal text except for underline */
a {
display:inline;
text-decoration: underline;
color: black;
font-weight: normal;
}
/* No underline for the links whose href we removed */
a.newwikinonexistent {
text-decoration: none;
}
/* show a box around image thumbnails */
div.thumb {
width: 80%;
border: dotted 1px black;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 2.5em;
margin-right: 2.5em;
padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[;
padding-bottom: 0.2em;
padding-left: 0.5em;
padding-right: 0.5em;
text-align: center;
font-size: 90%;
}
/* don't waste left margin for notes and list of pages */
ul, ol {
margin-left: 0em;
}
/* helps crengine to not display them as block elements */
time, abbr, sup {
display: inline;
}
]])
-- ----------------------------------------------------------------
-- OEBPS/toc.ncx : table of content
local toc_ncx_parts = {}
local depth = 0
local cur_level = 0
local np_end = [[</navPoint>]]
local num = 1
-- Add our own first section for first page, with page name as title
table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_cleaned))
table.insert(toc_ncx_parts, np_end)
-- Wikipedia section items seem to be already sorted by index, so no need to sort
for isec, s in ipairs(sections) do
num = num + 1
local s_anchor = s.anchor
local s_title = string.format("%s %s", s.number, s.line)
s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
local s_level = s.toclevel
if s_level > depth then
depth = s_level -- max depth required in toc.ncx
end
if s_level == cur_level then
table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
elseif s_level < cur_level then
table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
while s_level < cur_level do -- close all in-between navPoint
table.insert(toc_ncx_parts, np_end)
cur_level = cur_level - 1
end
elseif s_level > cur_level + 1 then
-- a jump from level N to level N+2 or more ... should not happen
-- per epub spec, but we don't know about wikipedia...
-- so we create missing intermediate navPoints with same anchor as current section
while s_level > cur_level + 1 do
table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level))
table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>-</text></navLabel><content src="content.html#%s"/>]], num, num, s_anchor))
cur_level = cur_level + 1
num = num + 1
end
-- elseif s_level == cur_level + 1 then
-- sublevel, nothing to close, nothing to add
end
cur_level = s_level
table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) -- indentation, in case a person looks at it
table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html#%s"/>]], num, num, s_title, s_anchor))
end
-- close nested <navPoint>
while cur_level > 0 do
table.insert(toc_ncx_parts, np_end)
cur_level = cur_level - 1
end
-- Prepend NCX head
table.insert(toc_ncx_parts, 1, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="%s"/>
<meta name="dtb:depth" content="%s"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
]], bookid, depth, page_cleaned))
-- Append NCX tail
table.insert(toc_ncx_parts, [[
</navMap>
</ncx>
]])
epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))
-- ----------------------------------------------------------------
-- OEBPS/content.html
-- Some small fixes to Wikipedia HTML to make crengine and the user happier
-- Most images are in a link to the image info page, which is a useless
-- external link for us, so let's remove this link.
html = html:gsub("<a[^>]*>%s*(<%s*img [^>]*>)%s*</a>", "%1")
-- For some <div class="thumb tright">, which include nested divs, although
-- perfectly balanced, crengine seems to miss some closing </div> and we
-- end up having our bordered image box include the remaining main wiki text.
-- It looks like this code is supposed to deal with class= containing multiple
-- class names:
-- https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
-- and that it may stumble over some cases.
-- It's all perfectly fine if we give all these divs a single class name
-- html = html:gsub([[<div class="thumb [^"]*">]], [[<div class="thumb">]])
--
-- But we may as well make all class= have a single name to avoid other problems
-- (no real risk with that, as we don't define any style for wikipedia class names,
-- except div.thumb that always appears first).
html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])
-- crengine seems to consider unknown tags as 'block' elements, so we may
-- want to remove or replace those that should be considered 'inline' elements
html = html:gsub("</?time[^>]*>", "")
-- Fix internal wikipedia links with full server url (including lang) so
-- ReaderLink can notice them and deal with them with a LookupWikipedia event.
local wiki_base_url = self:getWikiServer(lang)
-- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
--
-- Also, crengine deals strangely with percent-encoded utf8:
-- if the link in the html is: <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">
-- we get from credocument:getLinkFromPosition(): http://fr.wikipedia.org/wiki/Françoix
-- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8,
-- when we should have got "\xc3\xa7" ...
-- We can avoid that by putting plain unencoded UTF8 in the url
local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
local fixEncodedWikiPageTitle = function(wiki_page)
wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
end
html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
-- Remove href from links to non-existent wiki pages so they are not clickable:
-- <a href="/w/index.php?title=PageTitle&amp;action=edit&amp;redlink=1" class="new" title="PageTitle">PageTitle©on</a>
-- (removing the href="" makes them non-clickable)
html = html:gsub([[<a[^>]* class="new"[^>]*>]], [[<a class="newwikinonexistent">]])
-- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php..)
html = html:gsub([[href="//]], [[href="https://]])
-- crengine does not return a link if there are multiple class names in <a> (<a class="external text" href="">)
-- it would be no problem as we can't follow them, but when the user taps
-- on one, the tap is propagated to other widgets and a page change happens...
-- html = html:gsub([[<a rel="nofollow" class="external text"]], [[<a rel="nofollow" class="externaltext"]])
-- html = html:gsub([[<a class="external text"]], [[<a class="externaltext"]])
-- Solved by our multiple class names suppression above
-- Avoid link being clickable before <a> (if it starts a line) or after </a> (if it
-- ends a line or a block) by wrapping it with U+200B ZERO WIDTH SPACE which will
-- make the DOM tree walking code that finds a link stop at it.
-- html = html:gsub("(<[aA])", "\xE2\x80\x8B%1")
-- html = html:gsub("(</[aA]>)", "%1\xE2\x80\x8B")
-- Fixed in crengine lvtinydom.
if self.wiki_prettify then
-- Prepend some symbols to section titles for a better visual feeling of hierarchy
html = html:gsub("<h1>", "<h1> "..h1_sym.." ")
html = html:gsub("<h2>", "<h2> "..h2_sym.." ")
html = html:gsub("<h3>", "<h3> "..h3_sym.." ")
html = html:gsub("<h4>", "<h4> "..h4_sym.." ")
html = html:gsub("<h5>", "<h5> "..h5_sym.." ")
html = html:gsub("<h6>", "<h6> "..h6_sym.." ")
end
-- Note: in all the gsub patterns above, we used lowercase for tags and attributes
-- because it's how they are in wikipedia HTML and it makes the patterns simple.
-- If one day this changes, they'll have to be replaced with href => [Hh][Rr][Ee][Ff] ...
-- We can finally build the final HTML with some header of our own
local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S"))
local online_version_htmllink = string.format([[<a href="%s/wiki/%s">%s</a>]], wiki_base_url, page:gsub(" ", "_"), _("online version"))
local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink)
epub:add("OEBPS/content.html", string.format([[
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link type="text/css" rel="stylesheet" href="stylesheet.css"/>
</head>
<body>
<h1 class="koreaderwikifrontpage">%s</h1>
<h5 class="koreaderwikifrontpage">Wikipedia %s</h5>
<p class="koreaderwikifrontpage">%s<br/>%s</p>
<hr class="koreaderwikifrontpage"/>
%s
</body>
</html>
]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html))
-- ----------------------------------------------------------------
-- OEBPS/images/*
if include_images then
local nb_images = #images
for inum, img in ipairs(images) do
progress_callback(T(_("Fetching image %1 / %2 ..."), inum, nb_images))
local src = img.src
if use_img_2x and img.src2x then
src = img.src2x
end
logger.dbg("Getting img ", src)
local success, content = getUrlContent(src)
-- success, content = getUrlContent(src..".nonexistent") -- to simulate failure
if success then
logger.dbg("success, size:", #content)
else
logger.info("failed fetching:", src)
end
if success then
epub:add("OEBPS/"..img.imgpath, content)
else
local go_on = progress_callback(T(_("Failed getting image %1, continue anyway?"), inum), true)
if not go_on then
cancelled = true
break
end
end
end
end
-- Done with adding files
if cancelled then
progress_callback(_("Cleaning up..."))
else
progress_callback(_("Packing epub..."))
end
epub:close()
-- This was nearly a no-op, so sleep a bit to make that progress step seen
util.usleep(300000)
progress_callback() -- close last progress info
if cancelled then
-- Build was cancelled, remove the half-created .epub
if lfs.attributes(epub_path_tmp, "mode") == "file" then
os.remove(epub_path_tmp)
end
return false
end
-- Finally move the .tmp to the final file
os.rename(epub_path_tmp, epub_path)
logger.info("successfully created:", epub_path)
return true
end
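-- Batch/emulator use without a UI, per the note above (path hypothetical):
--   Wikipedia:createEpub("/tmp/E-reader.EN.epub", "E-reader", "en", true)
-- prompts then go through the logging-only fallback callback, which always
-- answers "OK".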
-- Wrapper to Wikipedia:createEpub() with UI progress info
function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback)
-- For progress_callback to be able to wait when needed
-- for user confirmation, we need to wrap Wikipedia:createEpub
-- in a coroutine that can be resumed by these confirm callbacks.
local UIManager = require("ui/uimanager")
local InfoMessage = require("ui/widget/infomessage")
local ConfirmBox = require("ui/widget/confirmbox")
-- Visual progress callback
local cur_progress_box = nil
local function ui_progress_callback(text, confirmbox)
if cur_progress_box then
-- close previous progress info
UIManager:close(cur_progress_box)
-- no repaint here, we'll do that below when new stuff is shown
end
if not text then
-- no text given, used to just close previous progress info when done
-- a repaint is needed
UIManager:forceRePaint()
return true
end
if confirmbox then
-- ConfirmBox requested: callbacks will resume coroutine
local _coroutine = coroutine.running()
cur_progress_box = ConfirmBox:new{
text = text,
ok_callback = function()
coroutine.resume(_coroutine, true)
end,
cancel_callback = function()
coroutine.resume(_coroutine, false)
end,
}
else
-- simple InfoMessage requested
cur_progress_box = InfoMessage:new{text = text}
end
logger.dbg("Showing", confirmbox and "ConfirmBox" or "InfoMessage", text)
UIManager:show(cur_progress_box)
UIManager:forceRePaint()
if not confirmbox then
return true -- nothing more to do
end
-- we need to wait for ConfirmBox callback
logger.dbg("waiting for coroutine to resume")
if coroutine.running() then
local result = coroutine.yield()
logger.dbg(" coroutine ran and returned", result)
return result
end
end
-- Coroutine wrapping Wikipedia:createEpub()
local co = coroutine.create(function()
-- If Wikipedia:createEpub() errors, the coroutine
-- would just abort without crashing the reader, so
-- pcall would not be needed. But if that happens,
-- pcall will let us know and return the error,
-- which we can log.
local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true, ui_progress_callback)
if ok and success then
result_callback(true)
else
ui_progress_callback() -- close any last progress info not cleaned
logger.warn("Wikipedia.createEpub pcall:", ok, success)
result_callback(false)
end
end)
-- Execute coroutine
coroutine.resume(co)
end
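-- The yield/resume handshake used above, in miniature (a standalone sketch,
-- defined only for illustration and never called):
local function _handshake_demo()
    local co
    co = coroutine.create(function()
        -- createEpub (via ui_progress_callback) blocks here awaiting the user
        local answer = coroutine.yield()
        logger.dbg("user answered", answer)
    end)
    coroutine.resume(co)       -- runs until the yield, i.e. until a ConfirmBox shows
    coroutine.resume(co, true) -- what ok_callback does: resume with the user's choice
end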
return Wikipedia
