local JSON = require("json")
local Screen = require("device").screen
local ffiutil = require("ffi/util")
local logger = require("logger")
local util = require("util")
local _ = require("gettext")
local T = ffiutil.template

--[[
-- Query wikipedia using the Wikimedia Web API.
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
--
-- To get parsed HTML:
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
--]]

local Wikipedia = {
    wiki_server = "https://%s.wikipedia.org",
    wiki_path = "/w/api.php",
    default_lang = "en",
    -- See https://www.mediawiki.org/wiki/API:Main_page for details.
    -- Search query, returns introductory texts (+ main thumbnail image)
    wiki_search_params = {
        action = "query",
        generator = "search",
        gsrnamespace = "0",
        -- gsrsearch = nil, -- text to lookup, will be added below
        gsrlimit = 20, -- max number of results to get
        exlimit = "max",
        prop = "extracts|info|pageimages", -- 'extracts' to get text, 'info' to get full page length
        format = "json",
        explaintext = "",
        exintro = "",
        -- We have to use 'exintro=' to get extracts for ALL results
        -- (otherwise, we get the full text for only the first result,
        -- and no text at all for the others).
    },
    -- Full article, parsed to output text (+ main thumbnail image)
    wiki_full_params = {
        action = "query",
        prop = "extracts|pageimages",
        format = "json",
        -- exintro = nil, -- get more than only the intro
        explaintext = "",
        redirects = "",
        -- title = nil, -- text to lookup, will be added below
    },
    -- Full article, parsed to output HTML, for Save as EPUB
    wiki_phtml_params = {
        action = "parse",
        format = "json",
        -- we only need the following pieces of information
        prop = "text|sections|displaytitle|revid",
        -- page = nil, -- text to lookup, will be added below
        -- disabletoc = "", -- if we want to remove the TOC from the html
        disablelimitreport = "",
        disableeditsection = "",
    },
    -- Full article, parsed to output HTML, for images extraction
    -- (used with full article as text, if "show more images" enabled)
    wiki_images_params = { -- same as previous one, with just text html
        action = "parse",
        format = "json",
        -- we only need the following piece of information
        prop = "text",
        -- page = nil, -- text to lookup, will be added below
        redirects = "",
        disabletoc = "", -- remove the TOC from the html
        disablelimitreport = "",
        disableeditsection = "",
    },
    -- There is an alternative way of obtaining a page's images:
    -- prop=imageinfo&action=query&iiprop=url|dimensions|mime|extmetadata&generator=images&pageids=49448&iiurlwidth=100&iiextmetadatafilter=ImageDescription
    -- but it gives all images (including wikipedia icons) in any order, without
    -- any score or information that would help deciding whether they matter or not.

    -- Allow for disabling prettifying the full page text
    wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),

    -- Can be set so HTTP requests will be done under Trapper and
    -- be interruptible
    trap_widget = nil,

    -- For actions done with Trapper:dismissable methods, we may throw
    -- an error() with this code. We make the value of this error
    -- accessible here so that callers can know it was a user dismiss.
    dismissed_error_code = "Interrupted by user",
}
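--[[
-- Illustrative caller-side pattern (a sketch, not code from this module):
-- when a method of this module runs under Trapper and the user dismisses
-- the operation, the method raises dismissed_error_code via error(), which
-- callers can recognize with pcall():
--
--   Wikipedia:setTrapWidget(info_widget) -- 'info_widget': some hypothetical dismissable widget
--   local ok, results = pcall(Wikipedia.searchAndGetIntros, Wikipedia, "ereader", "en")
--   Wikipedia:resetTrapWidget()
--   if not ok and results and results:find(Wikipedia.dismissed_error_code, 1, true) then
--       -- user dismissed the request: abort silently, this is not a real error
--   end
--]]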
function Wikipedia:getWikiServer(lang)
    return string.format(self.wiki_server, lang or self.default_lang)
end

-- Codes that getUrlContent may get from requester.request()
local TIMEOUT_CODE = "timeout" -- from socket.lua
local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime

-- Sink that stores into a table, aborting if maxtime has elapsed
local function sink_table_with_maxtime(t, maxtime)
    -- Start counting as soon as this sink is created
    local start_secs, start_usecs = ffiutil.gettime()
    local starttime = start_secs + start_usecs/1000000
    t = t or {}
    local f = function(chunk, err)
        local secs, usecs = ffiutil.gettime()
        if secs + usecs/1000000 - starttime > maxtime then
            return nil, MAXTIME_CODE
        end
        if chunk then
            table.insert(t, chunk)
        end
        return 1
    end
    return f, t
end

-- Get URL content
local function getUrlContent(url, timeout, maxtime)
    local socket = require("socket")
    local ltn12 = require("ltn12")
    local http = require("socket.http")
    local https = require("ssl.https")

    local requester
    if url:sub(1, 7) == "http://" then
        requester = http
    elseif url:sub(1, 8) == "https://" then
        requester = https
    else
        return false, "Unsupported protocol"
    end
    if not timeout then timeout = 10 end
    -- timeout needs to be set on 'http', even if we use 'https'
    http.TIMEOUT, https.TIMEOUT = timeout, timeout

    local request = {}
    local sink = {}
    request["url"] = url
    request["method"] = "GET"
    -- The 'timeout' delay works at the socket level: it is triggered when
    -- connecting takes longer than that, or, once connected, when no data
    -- has been read for that long.
    -- On a slow connection, it may never be triggered (reading 1 byte every
    -- second would not trip any timeout).
    -- 'maxtime' can be provided to overcome that: we start counting as soon
    -- as the sink is created (but it is only checked when data is received).
    -- Setting both 'maxtime' and 'timeout' gives a better chance of aborting
    -- a request that takes too long (in the worst case: after timeout+maxtime
    -- seconds). But the time taken by the DNS lookup cannot easily be
    -- accounted for, so a request may (when the DNS lookup is slow) still
    -- exceed timeout and maxtime...
    if maxtime then
        request["sink"] = sink_table_with_maxtime(sink, maxtime)
    else
        request["sink"] = ltn12.sink.table(sink)
    end

    local code, headers, status = socket.skip(1, requester.request(request))
    local content = table.concat(sink) -- empty or content accumulated till now
    -- logger.dbg("code:", code)
    -- logger.dbg("headers:", headers)
    -- logger.dbg("status:", status)
    -- logger.dbg("#content:", #content)
    if code == TIMEOUT_CODE or code == MAXTIME_CODE then
        logger.warn("request interrupted:", code)
        return false, code
    end
    if headers == nil then
        logger.warn("No HTTP headers:", code, status)
        return false, "Network or remote server unavailable"
    end
    if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
        logger.warn("HTTP status not okay:", code, status)
        return false, "Remote server error or unavailable"
    end
    if headers and headers["content-length"] then
        -- Check we really got the announced content size
        local content_length = tonumber(headers["content-length"])
        if #content ~= content_length then
            return false, "Incomplete content received"
        end
    end
    return true, content
end
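--[[
-- Sketch of how getUrlContent() is used by loadPage() below (the URL here
-- is just an illustration):
--
--   local success, content = getUrlContent(
--       "https://en.wikipedia.org/w/api.php?action=parse&page=E-book&format=json",
--       10, -- timeout: seconds of socket inactivity before aborting
--       60) -- maxtime: seconds allowed for the whole transfer
--   if success then
--       -- 'content' is the full response body
--   else
--       -- 'content' is an error string: TIMEOUT_CODE, MAXTIME_CODE,
--       -- or one of the messages returned above
--   end
--]]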
function Wikipedia:setTrapWidget(trap_widget)
    self.trap_widget = trap_widget
end

function Wikipedia:resetTrapWidget()
    self.trap_widget = nil
end

-- Possible values for the page_type parameter to loadPage()
local WIKIPEDIA_INTRO = 1
local WIKIPEDIA_FULL = 2
local WIKIPEDIA_PHTML = 3
local WIKIPEDIA_IMAGES = 4

--[[
-- Return decoded JSON table from Wikipedia
--]]
function Wikipedia:loadPage(text, lang, page_type, plain)
    local url = require("socket.url")
    local query = ""
    local parsed = url.parse(self:getWikiServer(lang))
    parsed.path = self.wiki_path
    if page_type == WIKIPEDIA_INTRO then -- search query
        self.wiki_search_params.explaintext = plain and "" or nil
        for k, v in pairs(self.wiki_search_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "gsrsearch=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_FULL then -- full page content
        self.wiki_full_params.explaintext = plain and "" or nil
        for k, v in pairs(self.wiki_full_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "titles=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
        for k, v in pairs(self.wiki_phtml_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "page=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_IMAGES then -- images found in page html
        for k, v in pairs(self.wiki_images_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "page=" .. url.escape(text)
    else
        return
    end

    local built_url = url.build(parsed)
    local completed, success, content
    if self.trap_widget then -- if previously set with Wikipedia:setTrapWidget()
        local Trapper = require("ui/trapper")
        local timeout, maxtime = 30, 60
        -- We use dismissableRunInSubprocess with complex return values:
        completed, success, content = Trapper:dismissableRunInSubprocess(function()
            return getUrlContent(built_url, timeout, maxtime)
        end, self.trap_widget)
        if not completed then
            error(self.dismissed_error_code) -- "Interrupted by user"
        end
    else
        local timeout, maxtime = 10, 60
        success, content = getUrlContent(built_url, timeout, maxtime)
    end
    if not success then
        error(content)
    end

    if content ~= "" and string.sub(content, 1, 1) == "{" then
        local ok, result = pcall(JSON.decode, content)
        if ok and result then
            logger.dbg("wiki result json:", result)
            return result
        else
            logger.warn("wiki result json decoding error:", result)
            error("Failed decoding JSON")
        end
    else
        logger.warn("wiki response is not json:", content)
        error("Response is not JSON")
    end
end
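--[[
-- Example use of loadPage() (a sketch: the shape of the returned table is
-- defined by the MediaWiki API, and loadPage() raises via error() on any
-- network or decoding failure):
--
--   local ok, result = pcall(Wikipedia.loadPage, Wikipedia,
--       "E-book", "en", WIKIPEDIA_FULL, true)
--   if ok and result and result.query then
--       for pageid, page in pairs(result.query.pages) do
--           logger.dbg(page.title, page.extract)
--       end
--   end
--]]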
-- search wikipedia and get intros for the results
function Wikipedia:searchAndGetIntros(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
    if result then
        local query = result.query
        if query then
            local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image")
            -- Scale wikipedia's normalized (we hope) thumbnail by 2 (adjusted
            -- to screen size/dpi) for intros (and 8x that for the highres image)
            local image_size_factor = Screen:scaleBySize(200)/100.0
            if show_image then
                for pageid, page in pairs(query.pages) do
                    self:addImages(page, lang, false, image_size_factor, 8)
                end
            end
            return query.pages
        end
    end
end

-- get full content of a wiki page
function Wikipedia:getFullPage(wiki_title, lang)
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_FULL, true)
    if result then
        local query = result.query
        if query then
            local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image")
            local show_more_images = G_reader_settings:nilOrTrue("wikipedia_show_more_images")
            -- Scale wikipedia's normalized (we hope) thumbnails by 4 (adjusted
            -- to screen size/dpi) for the full page (and 4x that for the highres image)
            local image_size_factor = Screen:scaleBySize(400)/100.0
            if self.wiki_prettify or show_image then
                for pageid, page in pairs(query.pages) do
                    if self.wiki_prettify and page.extract then
                        -- Prettification of the plain text full page
                        page.extract = self:prettifyText(page.extract)
                    end
                    if show_image then
                        self:addImages(page, lang, show_more_images, image_size_factor, 4)
                    end
                end
            end
            return query.pages
        end
    end
end

-- get parsed html content and other infos of a wiki page
function Wikipedia:getFullPageHtml(wiki_title, lang)
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_PHTML, true)
    if result and result.parse then
        return result.parse
    end
    -- result may be nil (loadPage() can return nothing), so guard before indexing
    if result and result.error and result.error.info then
        error(result.error.info)
    end
end
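--[[
-- Sketch of how the methods above can be combined (the "Save as EPUB"
-- feature is built on getFullPageHtml(); error handling elided):
--
--   local page = Wikipedia:getFullPageHtml("E-book", "en")
--   local html = page.text["*"]     -- the parsed article HTML
--   local title = page.displaytitle -- HTML title of the page
--   local sections = page.sections  -- table of section headings
--]]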
-- get images extracted from parsed html
function Wikipedia:getFullPageImages(wiki_title, lang)
    local images = {} -- will be returned, each in a format similar to page.thumbnail
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_IMAGES, true)
    if result and result.parse and result.parse.text and result.parse.text["*"] then
        local html = result.parse.text["*"] -- html content
        local url = require("socket.url")
        local wiki_base_url = self:getWikiServer(lang)

        local thumbs = {} -- bits of HTML containing an image
        -- We first try to catch images in