local Version = require("version") local ffiutil = require("ffi/util") local http = require("socket.http") local logger = require("logger") local ltn12 = require("ltn12") local socket = require("socket") local socket_url = require("socket.url") local socketutil = require("socketutil") local _ = require("gettext") local T = ffiutil.template local EpubDownloadBackend = { -- Can be set so HTTP requests will be done under Trapper and -- be interruptible trap_widget = nil, -- For actions done with Trapper:dismissable methods, we may throw -- and error() with this code. We make the value of this error -- accessible here so that caller can know it's a user dismiss. dismissed_error_code = "Interrupted by user", } local max_redirects = 5; --prevent infinite redirects -- filter HTML using CSS selector local function filter(text, element) local htmlparser = require("htmlparser") local root = htmlparser.parse(text, 5000) local filtered = nil local selectors = { "main", "article", "div#main", "#main-article", ".main-content", "#body", "#content", ".content", "div#article", "div.article", "div.post", "div.post-outer", ".l-root", ".content-container", ".StandardArticleBody_body", "div#article-inner", "div#newsstorytext", "div.general", } if element then table.insert(selectors, 1, element) end for _, sel in ipairs(selectors) do local elements = root:select(sel) if elements then for _, e in ipairs(elements) do filtered = e:getcontent() if filtered then break end end if filtered then break end end end if not filtered then return text end return "" .. filtered .. "" end -- Get URL content local function getUrlContent(url, timeout, maxtime, redirectCount) logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")") if not redirectCount then redirectCount = 0 elseif redirectCount == max_redirects then error("EpubDownloadBackend: reached max redirects: ", redirectCount) end if not timeout then timeout = 10 end logger.dbg("timeout:", timeout) local sink = {} local parsed = socket_url.parse(url) socketutil:set_timeout(timeout, maxtime or 30) local request = { url = url, method = "GET", sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink), } logger.dbg("request:", request) local code, headers, status = socket.skip(1, http.request(request)) socketutil:reset_timeout() logger.dbg("After http.request") local content = table.concat(sink) -- empty or content accumulated till now logger.dbg("type(code):", type(code)) logger.dbg("code:", code) logger.dbg("headers:", headers) logger.dbg("status:", status) logger.dbg("#content:", #content) if code == socketutil.TIMEOUT_CODE or code == socketutil.SSL_HANDSHAKE_CODE or code == socketutil.SINK_TIMEOUT_CODE then logger.warn("request interrupted:", code) return false, code end if headers == nil then logger.warn("No HTTP headers:", code, status) return false, "Network or remote server unavailable" end if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302... local redirected_url = headers.location local parsed_redirect_location = socket_url.parse(redirected_url) if not parsed_redirect_location.host then parsed_redirect_location.host = parsed.host parsed_redirect_location.scheme = parsed.scheme redirected_url = socket_url.build(parsed_redirect_location) end logger.dbg("getUrlContent: Redirecting to url: ", redirected_url) return getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1) else error("EpubDownloadBackend: Don't know how to handle HTTP response status: ", status) end logger.warn("HTTP status not okay:", code, status) return false, "Remote server error or unavailable" end if headers and headers["content-length"] then -- Check we really got the announced content size local content_length = tonumber(headers["content-length"]) if #content ~= content_length then return false, "Incomplete content received" end end logger.dbg("Returning content ok") return true, content end function EpubDownloadBackend:getResponseAsString(url) logger.dbg("EpubDownloadBackend:getResponseAsString(", url, ")") local success, content = getUrlContent(url) if (success) then return content else error("Failed to download content for url:", url) end end function EpubDownloadBackend:setTrapWidget(trap_widget) self.trap_widget = trap_widget end function EpubDownloadBackend:resetTrapWidget() self.trap_widget = nil end function EpubDownloadBackend:loadPage(url) local completed, success, content if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget() local Trapper = require("ui/trapper") local timeout, maxtime = 30, 60 -- We use dismissableRunInSubprocess with complex return values: completed, success, content = Trapper:dismissableRunInSubprocess(function() return getUrlContent(url, timeout, maxtime) end, self.trap_widget) if not completed then error(self.dismissed_error_code) -- "Interrupted by user" end else local timeout, maxtime = 10, 60 success, content = getUrlContent(url, timeout, maxtime) end logger.dbg("success:", success, "type(content):", type(content), "content:", content:sub(1, 500), "...") if not success then error(content) else return content end end local ext_to_mimetype = { png = "image/png", jpg = "image/jpeg", jpeg = "image/jpeg", gif = "image/gif", svg = "image/svg+xml", html= "application/xhtml+xml", xhtml= "application/xhtml+xml", ncx = "application/x-dtbncx+xml", js = "text/javascript", css = "text/css", otf = "application/opentype", ttf = "application/truetype", woff = "application/font-woff", } -- Create an epub file (with possibly images) function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element) logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")") -- Use Trapper to display progress and ask questions through the UI. -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise -- Trapper:info() and Trapper:confirm() will just use logger. local UI = require("ui/trapper") -- We may need to build absolute urls for non-absolute links and images urls local base_url = socket_url.parse(url) local cancelled = false local page_htmltitle = html:match([[(.*)]]) logger.dbg("page_htmltitle is ", page_htmltitle) -- local sections = html.sections -- Wikipedia provided TOC local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid) -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is, -- should it changes if content is updated (as now, including the wikipedia revisionId), -- or should it stays the same even if revid changes (content of the same book updated). if filter_enable then html = filter(html, filter_element) end local images = {} local seen_images = {} local imagenum = 1 local cover_imgid = nil -- best candidate for cover among our images local processImg = function(img_tag) local src = img_tag:match([[src="([^"]*)"]]) if src == nil or src == "" then logger.dbg("no src found in ", img_tag) return nil end if src:sub(1,2) == "//" then src = "https:" .. src -- Wikipedia redirects from http to https, so use https elseif src:sub(1,1) == "/" then -- non absolute url src = socket_url.absolute(base_url, src) end local cur_image if seen_images[src] then -- already seen cur_image = seen_images[src] else local src_ext = src if src_ext:find("?") then -- "/w/extensions/wikihiero/img/hiero_D22.png?0b8f1" src_ext = src_ext:match("(.-)%?") -- remove ?blah end local ext = src_ext:match(".*%.(%S%S%S?%S?%S?)$") -- extensions are only 2 to 5 chars if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it logger.dbg("no file extension found in ", src) return nil end ext = ext:lower() local imgid = string.format("img%05d", imagenum) local imgpath = string.format("images/%s.%s", imgid, ext) local mimetype = ext_to_mimetype[ext] or "" local width = tonumber(img_tag:match([[width="([^"]*)"]])) local height = tonumber(img_tag:match([[height="([^"]*)"]])) -- Get higher resolution (2x) image url local src2x = nil local srcset = img_tag:match([[srcset="([^"]*)"]]) if srcset then srcset = " "..srcset.. ", " -- for next pattern to possibly match 1st or last item src2x = srcset:match([[ (%S+) 2x, ]]) if src2x then if src2x:sub(1,2) == "//" then src2x = "https:" .. src2x elseif src2x:sub(1,1) == "/" then -- non absolute url src2x = socket_url.absolute(base_url, src2x) end end end cur_image = { imgid = imgid, imgpath = imgpath, src = src, src2x = src2x, mimetype = mimetype, width = width, height = height, } table.insert(images, cur_image) seen_images[src] = cur_image -- Use first image of reasonable size (not an icon) and portrait-like as cover-image if not cover_imgid and width and width > 50 and height and height > 50 and height > width then logger.dbg("Found a suitable cover image") cover_imgid = imgid end imagenum = imagenum + 1 end -- crengine will NOT use width and height attributes, but it will use -- those found in a style attribute. -- If we get src2x images, crengine will scale them down to the 1x image size -- (less space wasted by images while reading), but the 2x quality will be -- there when image is viewed full screen with ImageViewer widget. local style_props = {} if cur_image.width then table.insert(style_props, string.format("width: %spx", cur_image.width)) end if cur_image.height then table.insert(style_props, string.format("height: %spx", cur_image.height)) end local style = table.concat(style_props, "; ") return string.format([[]], cur_image.imgpath, style) end html = html:gsub("(<%s*img [^>]*>)", processImg) logger.dbg("Images found in html:", images) -- See what to do with images local use_img_2x = false if not include_images then -- Remove img tags to avoid little blank squares of missing images html = html:gsub("<%s*img [^>]*>", "") -- We could remove the whole image container
, -- but it's a lot of nested
and not easy to do. -- So the user will see the image legends and know a bit about -- the images he chose to not get. end UI:info(T(_("%1\n\nBuilding EPUB…"), message)) -- Open the zip file (with .tmp for now, as crengine may still -- have a handle to the final epub_path, and we don't want to -- delete a good one if we fail/cancel later) local epub_path_tmp = epub_path .. ".tmp" local ZipWriter = require("ffi/zipwriter") local epub = ZipWriter:new{} if not epub:open(epub_path_tmp) then logger.dbg("Failed to open epub_path_tmp") return false end -- We now create and add all the required epub files -- ---------------------------------------------------------------- -- /mimetype : always "application/epub+zip" epub:add("mimetype", "application/epub+zip") -- ---------------------------------------------------------------- -- /META-INF/container.xml : always the same content epub:add("META-INF/container.xml", [[ ]]) logger.dbg("Added META-INF/container.xml") -- ---------------------------------------------------------------- -- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory) -- Other possible items in this file that are of no interest to crengine : -- In : -- -- -- (crengine only uses to get the cover image) -- In : -- -- And a section : -- -- -- -- local content_opf_parts = {} -- head local meta_cover = "" if include_images and cover_imgid then meta_cover = string.format([[]], cover_imgid) end logger.dbg("meta_cover:", meta_cover) table.insert(content_opf_parts, string.format([[ %s KOReader %s %s ]], page_htmltitle, Version:getCurrentRevision(), meta_cover)) -- images files if include_images then for inum, img in ipairs(images) do table.insert(content_opf_parts, string.format([[ %s]], img.imgid, img.imgpath, img.mimetype, "\n")) end end -- tail table.insert(content_opf_parts, [[ ]]) epub:add("OEBPS/content.opf", table.concat(content_opf_parts)) logger.dbg("Added OEBPS/content.opf") -- ---------------------------------------------------------------- -- OEBPS/stylesheet.css --- @todo We told it we'd include a stylesheet.css, so it's probably best -- that we do. In theory, we could try to fetch any *.css files linked in -- the main html. epub:add("OEBPS/stylesheet.css", [[ /* Empty */ ]]) logger.dbg("Added OEBPS/stylesheet.css") -- ---------------------------------------------------------------- -- OEBPS/toc.ncx : table of content local toc_ncx_parts = {} local depth = 0 local cur_level = 0 local np_end = [[]] local num = 1 -- Add our own first section for first page, with page name as title table.insert(toc_ncx_parts, string.format([[%s]], num, num, page_htmltitle)) table.insert(toc_ncx_parts, np_end) --- @todo Not essential for most articles, but longer articles might benefit -- from parsing tags and constructing a proper TOC while cur_level > 0 do table.insert(toc_ncx_parts, np_end) cur_level = cur_level - 1 end -- Prepend NCX head table.insert(toc_ncx_parts, 1, string.format([[ %s ]], bookid, depth, page_htmltitle)) -- Append NCX tail table.insert(toc_ncx_parts, [[ ]]) epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts)) logger.dbg("Added OEBPS/toc.ncx") -- ---------------------------------------------------------------- -- OEBPS/content.html epub:add("OEBPS/content.html", html) logger.dbg("Added OEBPS/content.html") -- Force a GC to free the memory we used till now (the second call may -- help reclaim more memory). collectgarbage() collectgarbage() -- ---------------------------------------------------------------- -- OEBPS/images/* if include_images then local nb_images = #images for inum, img in ipairs(images) do -- Process can be interrupted at this point between each image download -- by tapping while the InfoMessage is displayed -- We use the fast_refresh option from image #2 for a quicker download local go_on = UI:info(T(_("%1\n\nRetrieving image %2 / %3 …"), message, inum, nb_images), inum >= 2) if not go_on then logger.dbg("cancelled") cancelled = true break end local src = img.src if use_img_2x and img.src2x then src = img.src2x end logger.dbg("Getting img ", src) local success, content = getUrlContent(src) -- success, content = getUrlContent(src..".unexistant") -- to simulate failure if success then logger.dbg("success, size:", #content) else logger.dbg("failed fetching:", src) end if success then -- Images do not need to be compressed, so spare some cpu cycles local no_compression = true if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text) no_compression = false end epub:add("OEBPS/"..img.imgpath, content, no_compression) logger.dbg("Adding OEBPS/"..img.imgpath) else go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue")) if not go_on then cancelled = true break end end end end -- Done with adding files if cancelled then if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then cancelled = false end end if cancelled then UI:info(_("Canceled. Cleaning up…")) else UI:info(T(_("%1\n\nPacking EPUB…"), message)) end epub:close() if cancelled then -- Build was cancelled, remove half created .epub if lfs.attributes(epub_path_tmp, "mode") == "file" then os.remove(epub_path_tmp) end return false end -- Finally move the .tmp to the final file os.rename(epub_path_tmp, epub_path) logger.dbg("successfully created:", epub_path) -- Force a GC to free the memory we used (the second call may help -- reclaim more memory). collectgarbage() collectgarbage() return true end return EpubDownloadBackend