calibre plugin: handle huge metadata files (#7159)

Metadata files larger than 30 MB are parsed manually, line by line, to avoid out-of-memory (OOM) errors.
pull/7202/head v2021.01
Martín Fernández 3 years ago committed by GitHub
parent 2db763c6f7
commit a5768c7411
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -8,34 +8,42 @@ of storing it.
@module koplugin.calibre.metadata
--]]--
local lfs = require("libs/libkoreader-lfs")
local rapidjson = require("rapidjson")
local logger = require("logger")
local parser = require("parser")
local util = require("util")
local unused_metadata = {
"application_id",
"author_link_map",
"author_sort",
"author_sort_map",
"book_producer",
"comments",
"cover",
"db_id",
"identifiers",
"languages",
"pubdate",
"publication_type",
"publisher",
"rating",
"rights",
"thumbnail",
"timestamp",
"title_sort",
"user_categories",
"user_metadata",
"_series_sort_",
-- metadata keys that slim() copies into each reduced book record; slim()
-- walks this list in order with ipairs, so it must stay a plain sequence.
local used_metadata = {
"uuid",
"lpath",
"last_modified",
"size",
"title",
"authors",
"tags",
"series",
"series_index"
}
-- Builds a reduced copy of a book record holding only the keys listed in
-- used_metadata.
--
-- Fallbacks for missing (nil/false) values:
--   series, series_index -> rapidjson.null
--   tags                 -> a fresh empty table
-- Every other key is copied as-is, so a missing value stays missing.
local function slim(book)
    local slim_book = {}
    for _, k in ipairs(used_metadata) do
        local value = book[k]
        if not value then
            if k == "tags" then
                value = {}
            elseif k == "series" or k == "series_index" then
                value = rapidjson.null
            end
        end
        slim_book[k] = value
    end
    return slim_book
end
-- this is the max file size we attempt to decode using json. For larger
-- files we want to attempt to manually parse the file to avoid OOM errors
-- Value is 30 * 1000 * 1000 bytes (decimal megabytes, not MiB).
local MAX_JSON_FILESIZE = 30 * 1000 * 1000
--- find calibre files for a given dir
local function findCalibreFiles(dir)
local function existOrLast(file)
@ -90,12 +98,28 @@ end
-- loads books' metadata from JSON file
-- NOTE(review): this span reads like a diff rendered without +/- markers: an
-- earlier body that only calls rapidjson.load appears interleaved with a newer
-- one that inspects the file first. Confirm which lines are current before
-- editing; the comments below describe each statement as written.
function CalibreMetadata:loadBookList()
local json, err = rapidjson.load(self.metadata)
if not json then
logger.warn("Unable to load book list from JSON file:", self.metadata, err)
-- look up the file's attributes; no attributes means nothing to load
local attr = lfs.attributes(self.metadata)
if not attr then
logger.warn("Unable to get file attributes from JSON file:", self.metadata)
return {}
end
return json
-- only a non-empty regular file is accepted
local valid = attr.mode == "file" and attr.size > 0
if not valid then
logger.warn("File is invalid", self.metadata)
return {}
end
local books, err
-- files above MAX_JSON_FILESIZE go through parser.parseFile; everything
-- else is decoded with rapidjson.load
if attr.size > MAX_JSON_FILESIZE then
books, err = parser.parseFile(self.metadata)
else
books, err = rapidjson.load(self.metadata)
end
-- a failed load is logged and reported to the caller as an empty table
if not books then
logger.warn(string.format("Unable to load library from json file %s: \n%s",
self.metadata, err))
return {}
end
return books
end
-- saves books' metadata to JSON file
@ -114,11 +138,8 @@ function CalibreMetadata:saveBookList()
end
-- add a book to our books table
-- NOTE(review): two addBook headers appear back to back here (looks like a
-- diff shown without +/- markers). One body clears every unused_metadata key
-- on the incoming table and appends that table; the other appends slim(book).
-- Confirm which is current before editing.
function CalibreMetadata:addBook(metadata)
for _, key in pairs(unused_metadata) do
metadata[key] = nil
end
table.insert(self.books, #self.books + 1, metadata)
function CalibreMetadata:addBook(book)
-- append the reduced record returned by slim() to self.books
table.insert(self.books, #self.books + 1, slim(book))
end
-- remove a book from our books table
@ -180,13 +201,9 @@ end
-- removes unused metadata from books
-- NOTE(review): two loop bodies appear interleaved here (diff rendered without
-- +/- markers?). One clears every unused_metadata key in place through the
-- slim_books alias of self.books; the other replaces each entry of self.books
-- with slim(book). Confirm which is current before editing.
function CalibreMetadata:cleanUnused()
local slim_books = self.books
for index, _ in ipairs(slim_books) do
for _, key in pairs(unused_metadata) do
slim_books[index][key] = nil
end
for index, book in ipairs(self.books) do
self.books[index] = slim(book)
end
self.books = slim_books
-- write the trimmed list back out via saveBookList
self:saveBookList()
end
@ -238,14 +255,17 @@ function CalibreMetadata:init(dir, is_search)
return false
end
local deleted_count = self:prune()
local elapsed = socket.gettime() - start
logger.info(string.format(
"calibre info loaded from disk in %f milliseconds: %d books. %d pruned",
elapsed * 1000, #self.books, deleted_count))
if not is_search then
local msg
if is_search then
msg = string.format("(search) in %f milliseconds: %d books",
(socket.gettime() - start) * 1000, #self.books)
else
local deleted_count = self:prune()
self:cleanUnused()
msg = string.format("in %f milliseconds: %d books. %d pruned",
(socket.gettime() - start) * 1000, #self.books, deleted_count)
end
logger.info(string.format("calibre info loaded from disk %s", msg))
return true
end

@ -0,0 +1,90 @@
-- A parser for metadata.calibre
local util = require("util")
-- Cuts the value out of a raw line and decodes its \uXXXX escapes.
--
-- s: the raw line
-- n: index of the first character to keep
-- j: number of trailing characters to drop when the line does NOT end in a
--    double quote; a line that ends in a double quote drops only that quote
--
-- Only lowercase four-digit hex escapes are decoded; each one is handed to
-- util.unicodeCodepointToUtf8 on its own. Returns the resulting string.
local function replaceHexChars(s, n, j)
    local len = string.len(s)
    local trailing = j
    if string.sub(s, len, len) == "\"" then
        trailing = 1
    end
    local body = string.sub(s, n, len - trailing)
    local decoded = string.gsub(body, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(hex)
        return util.unicodeCodepointToUtf8(tonumber(hex, 16))
    end)
    -- gsub also returns a match count; hand back only the string
    return decoded
end
-- a couple of string helper functions for dealing with raw json strings
--
-- Line layout these helpers and parseFile rely on (it follows from the
-- offsets used throughout this file):
--   book delimiters at two spaces:   `  }, `  or  `  }`
--   book fields at four spaces:      `    "key": "value", `
--   array items at six spaces:       `      "item", `
-- NOTE(review): the `, ` (comma + space) after each value is what the strip
-- counts assume; confirm against a real metadata.calibre file.

-- indentation that precedes every book-level key
local FIELD_INDENT = "    "

-- keys whose value sits on the same line as the key
local SCALAR_KEYS = {
    "title", "uuid", "lpath", "size",
    "last_modified", "series", "series_index",
}

-- Returns true when the raw line starts with the indented, quoted key.
-- The compared length is taken from the prefix itself so the two can never
-- drift apart.
local function isEqual(str, key)
    local prefix = FIELD_INDENT .. "\"" .. key .. "\""
    return str:sub(1, #prefix) == prefix
end

-- Extracts the value of `key` from a raw `    "key": value, ` line.
-- Returns nil for a JSON null, otherwise the value as a string with \uXXXX
-- escapes decoded.
-- NOTE(review): bare numbers (size, series_index) come back as their raw
-- text, not converted with tonumber; check callers that do arithmetic.
local function getValue(str, key)
    local head = FIELD_INDENT .. "\"" .. key .. "\": "
    if str == head .. "null, " then
        return nil
    end
    local first = #head + 1
    if str:sub(first, first) == "\"" then
        -- quoted string: skip the opening quote, drop the trailing `", `
        return replaceHexChars(str, first + 1, 3)
    end
    -- bare value: nothing to skip in front, drop only the trailing `, `
    return replaceHexChars(str, first, 2)
end

-- Expose the helpers as string methods (line:equals(key), line:value(key)).
-- getmetatable("").__index is shared by every string in the Lua state, so
-- these two names become visible on all strings. parseFile below calls the
-- helpers directly; the methods are kept for backward compatibility.
local jsonStr = getmetatable("")
jsonStr.__index["equals"] = isEqual
jsonStr.__index["value"] = getValue

local parser = {}

-- read metadata from file, line by line, and keep just the data we need
--
-- file: path of the metadata file (must be a string, otherwise asserts)
-- Returns a list of book tables, or nil plus an error message when the file
-- cannot be opened.
function parser.parseFile(file)
    assert(type(file) == "string", "wrong type (expected a string)")
    local f, err = io.open(file, "r")
    if not f then
        return nil, string.format("error parsing %s: %s", file, err)
    end

    -- Appends the array item found on `line` to list `t`.
    -- The first call for each array happens on the header line itself
    -- (`"authors": [`), when `t` is still nil: that call only creates the
    -- list, so the header is never stored as an item.
    local add = function(t, line)
        if type(t) ~= "table" or type(line) ~= "string" then
            return {}
        end
        t[#t + 1] = replaceHexChars(line, 8, 3)
        return t
    end

    local books, book = {}, {}
    local is_author, is_tag = false, false
    -- read through the handle we already hold instead of reopening the file
    for line in f:lines() do
        if line == "  }, " or line == "  }" then
            -- end of a book object: store it and start a fresh one
            books[#books + 1] = book
            book = {}
        elseif line == "    \"authors\": [" then
            is_author = true
        elseif line == "    \"tags\": [" then
            is_tag = true
        elseif line == "    ], " or line == "    ]" then
            is_author, is_tag = false, false
        else
            for _, key in ipairs(SCALAR_KEYS) do
                if isEqual(line, key) then
                    book[key] = getValue(line, key)
                    break
                end
            end
        end

        if is_author then
            book.authors = add(book.authors, line)
        elseif is_tag then
            book.tags = add(book.tags, line)
        end
    end
    f:close()
    return books
end
return parser

@ -18,6 +18,7 @@ local Screen = require("device").screen
local UIManager = require("ui/uimanager")
local logger = require("logger")
local socket = require("socket")
local util = require("util")
local _ = require("gettext")
local T = require("ffi/util").template
@ -45,16 +46,8 @@ local function getAllMetadata(t)
end
end
for _, book in ipairs(CalibreMetadata.books) do
local slim_book = {}
slim_book.title = book.title
slim_book.lpath = book.lpath
slim_book.authors = book.authors
slim_book.series = book.series
slim_book.series_index = book.series_index
slim_book.tags = book.tags
slim_book.size = book.size
slim_book.rootpath = path
table.insert(books, #books + 1, slim_book)
book.rootpath = path
table.insert(books, #books + 1, book)
end
CalibreMetadata:clean()
end
@ -103,9 +96,11 @@ end
local function searchByTag(t, query, case_insensitive)
local freq = {}
for _, book in ipairs(t) do
for __, tag in ipairs(book.tags) do
if match(tag, query, case_insensitive) then
freq[tag] = (freq[tag] or 0) + 1
if type(book.tags) == "table" then
for __, tag in ipairs(book.tags) do
if match(tag, query, case_insensitive) then
freq[tag] = (freq[tag] or 0) + 1
end
end
end
end
@ -145,7 +140,7 @@ local function getBookInfo(book)
-- all entries can be empty, except size, which is always filled by calibre.
local title = _("Title:") .. " " .. book.title or "-"
local authors = _("Author(s):") .. " " .. getEntries(book.authors) or "-"
local size = _("Size:") .. " " .. string.format("%4.1fM", book.size/1024/1024)
local size = _("Size:") .. " " .. util.getFriendlySize(book.size) or _("Unknown")
local tags = getEntries(book.tags)
if tags then
tags = _("Tags:") .. " " .. tags
@ -329,7 +324,7 @@ function CalibreSearch:find(option)
-- measure time elapsed searching
local start = socket.gettime()
if option == "find" then
local books = self:findBooks(self.books, self.search_value)
local books = self:findBooks(self.search_value)
local result = self:bookCatalog(books)
self:showresults(result)
else
@ -346,7 +341,7 @@ function CalibreSearch:find(option)
end
-- find books with current search options
function CalibreSearch:findBooks(t, query)
function CalibreSearch:findBooks(query)
-- handle case sensitivity
local function bookMatch(s, p)
if not s or not p then return false end
@ -375,7 +370,7 @@ function CalibreSearch:findBooks(t, query)
end
-- performs a book search
local results = {}
for i, book in ipairs(t) do
for i, book in ipairs(self.books) do
if bookSearch(book, query) then
table.insert(results, #results + 1, book)
end
@ -597,7 +592,7 @@ function CalibreSearch:getMetadata()
-- try to load metadata from calibre files and dump it to cache file, if enabled.
local books = getAllMetadata(self.libraries)
if self.cache_metadata then
local dump = {}
local serialized_table = {}
local function removeNull(t)
for _, key in ipairs({"series", "series_index"}) do
if type(t[key]) == "function" then
@ -607,9 +602,9 @@ function CalibreSearch:getMetadata()
return t
end
for index, book in ipairs(books) do
table.insert(dump, index, removeNull(book))
table.insert(serialized_table, index, removeNull(book))
end
self.cache_books:save(dump)
self.cache_books:save(serialized_table)
end
local elapsed = socket.gettime() - start
logger.info(string.format(template, #books, "calibre", elapsed * 1000))

Loading…
Cancel
Save