[NewsDownloader] Added an HTML filter through a CSS selector (#6228)

Fixes #6185.
reviewable/pr6236/r1
Mikolaj "lich" H 4 years ago committed by GitHub
parent 6b5d50a8af
commit b741fcebac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -24,6 +24,54 @@ local max_redirects = 5; --prevent infinite redirects
local TIMEOUT_CODE = "timeout" -- from socket.lua
local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime
-- filter HTML using CSS selector
-- Filter an HTML page down to its main content using a CSS selector.
-- @tparam string text raw HTML of the page
-- @tparam[opt] string element custom CSS selector to try first; any
--   non-string value (e.g. a boolean coming from an unset config entry)
--   is ignored
-- @treturn string a minimal HTML document wrapping the first matching
--   element's content, or the original `text` unchanged when no selector
--   matches (so no content is ever lost)
local function filter(text, element)
    local htmlparser = require("htmlparser")
    -- Cap the parser at 5000 elements to bound work on very large pages.
    local root = htmlparser.parse(text, 5000)
    local filtered = nil
    -- Fallback selectors commonly used to mark up the main article body,
    -- tried in order until one yields content.
    local selectors = {
        "main",
        "article",
        "div#main",
        "#main-article",
        ".main-content",
        "#body",
        "#content",
        ".content",
        "div#article",
        "div.article",
        "div.post",
        "div.post-outer",
        ".l-root",
        ".content-container",
        ".StandardArticleBody_body",
        "div#article-inner",
        "div#newsstorytext",
        "div.general",
    }
    -- Only accept a real selector string: callers may pass `true` when the
    -- config entry is unset (`feed.filter_element or feed.filter_element == nil`),
    -- and a boolean would break root:select().
    if type(element) == "string" and element ~= "" then
        table.insert(selectors, 1, element)
    end
    for _, sel in ipairs(selectors) do
        local elements = root:select(sel)
        if elements then
            -- Take the first element of this selector that has content.
            for _, e in ipairs(elements) do
                filtered = e:getcontent()
                if filtered then
                    break
                end
            end
        end
        if filtered then
            break
        end
    end
    if not filtered then
        -- Nothing matched: return the full page rather than an empty one.
        return text
    end
    return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
end
-- Sink that stores into a table, aborting if maxtime has elapsed
local function sink_table_with_maxtime(t, maxtime)
-- Start counting as soon as this sink is created
@ -181,15 +229,13 @@ local ext_to_mimetype = {
ttf = "application/truetype",
woff = "application/font-woff",
}
-- Create an epub file (with possibly images)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
-- Use Trapper to display progress and ask questions through the UI.
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
-- Trapper:info() and Trapper:confirm() will just use logger.
local UI = require("ui/trapper")
-- We may need to build absolute urls for non-absolute links and images urls
local base_url = socket_url.parse(url)
@ -201,7 +247,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it changes if content is updated (as now, including the wikipedia revisionId),
-- or should it stays the same even if revid changes (content of the same book updated).
if filter_enable then html = filter(html, filter_element) end
local images = {}
local seen_images = {}
local imagenum = 1

@ -21,12 +21,19 @@ return {--do NOT change this line
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
-- default value is 'false' (if no 'include_images' entry)
-- 'enable_filter=true' - means filter the page with a CSS selector, keeping only the matching part (does not apply if download_full_article=false)
-- 'enable_filter=false' - means no such filtering and including the full page
-- default value is 'false'
-- 'filter_element="name_of_css.element.class"' - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
-- The default value is empty. If set, the chosen selector is tried first; the built-in list of common selectors is used as a fallback.
-- comment out line ("--" at line start) to stop downloading source
-- LIST YOUR FEEDS HERE:
{ "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true},
{ "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true, include_images=true, enable_filter=true},
{ "https://www.pcworld.com/index.rss", limit = 7 , download_full_article=false},

@ -199,10 +199,12 @@ function NewsDownloader:loadConfigAndProcessFeeds()
local limit = feed.limit
local download_full_article = feed.download_full_article == nil or feed.download_full_article
local include_images = not never_download_images and feed.include_images
local enable_filter = feed.enable_filter or feed.enable_filter == nil
local filter_element = feed.filter_element or feed.filter_element == nil
if url and limit then
local feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url))
UI:info(feed_message)
NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message)
NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message, enable_filter, filter_element)
else
logger.warn('NewsDownloader: invalid feed config entry', feed)
end
@ -230,7 +232,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI()
end)
end
function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message)
function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
local ok, response = pcall(function()
return DownloadBackend:getResponseAsString(url)
@ -250,11 +252,11 @@ function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, do
if is_atom then
ok = pcall(function()
return self:processAtom(feeds, limit, download_full_article, include_images, message)
return self:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
end)
elseif is_rss then
ok = pcall(function()
return self:processRSS(feeds, limit, download_full_article, include_images, message)
return self:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
end)
end
if not ok or (not is_rss and not is_atom) then
@ -280,7 +282,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
return xmlhandler.root
end
function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message)
function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_output_dir = string.format("%s%s/",
news_download_dir_path,
util.getSafeFilename(getFeedTitle(feeds.feed.title)))
@ -294,14 +296,14 @@ function NewsDownloader:processAtom(feeds, limit, download_full_article, include
end
local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
if download_full_article then
self:downloadFeed(feed, feed_output_dir, include_images, article_message)
self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
else
self:createFromDescription(feed, feed.content[1], feed_output_dir, include_images, article_message)
end
end
end
function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message)
function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_output_dir = ("%s%s/"):format(
news_download_dir_path, util.getSafeFilename(util.htmlEntitiesToUtf8(feeds.rss.channel.title)))
if not lfs.attributes(feed_output_dir, "mode") then
@ -314,7 +316,7 @@ function NewsDownloader:processRSS(feeds, limit, download_full_article, include_
end
local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
if download_full_article then
self:downloadFeed(feed, feed_output_dir, include_images, article_message)
self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
else
self:createFromDescription(feed, feed.description, feed_output_dir, include_images, article_message)
end
@ -341,7 +343,7 @@ local function getTitleWithDate(feed)
return title
end
function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message)
function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element)
local title_with_date = getTitleWithDate(feed)
local news_file_path = ("%s%s%s"):format(feed_output_dir,
title_with_date,
@ -355,7 +357,7 @@ function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, mess
local article_message = T(_("%1\n%2"), message, title_with_date)
local link = getFeedLink(feed.link)
local html = DownloadBackend:loadPage(link)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
end
end

Loading…
Cancel
Save