md5: centralize and deduplicate (#11003)

The document's partial MD5 hash is calculated by util.partialMD5() and stored in doc_settings as "partial_md5_checksum" the first time the document is opened.
reviewable/pr11010/r2
hius07 7 months ago committed by GitHub
parent e9051353a2
commit 2ed2c2c23d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -464,23 +464,23 @@ function ReaderUI:init()
end
self.postInitCallback = nil
-- Now that document is loaded, store book metadata in settings
-- (so that filemanager can use it from sideCar file to display
-- Book information).
-- Now that document is loaded, store book metadata in settings.
local props = self.document:getProps()
self.doc_settings:saveSetting("doc_props", props)
-- And have an extended and customized copy in memory for quick access.
self.doc_props = FileManagerBookInfo.extendProps(props, self.document.file)
-- Set "reading" status if there is no status.
local summary = self.doc_settings:readSetting("summary")
if not (summary and summary.status) then
if not summary then
summary = {}
end
local summary = self.doc_settings:readSetting("summary", {})
if summary.status == nil then
summary.status = "reading"
summary.modified = os.date("%Y-%m-%d", os.time())
self.doc_settings:saveSetting("summary", summary)
end
local md5 = self.doc_settings:readSetting("partial_md5_checksum")
if md5 == nil then
md5 = util.partialMD5(self.document.file)
self.doc_settings:saveSetting("partial_md5_checksum", md5)
end
require("readhistory"):addItem(self.document.file) -- (will update "lastfile")

@ -20,7 +20,7 @@ local DOCSETTINGS_HASH_DIR = DataStorage:getDocSettingsHashDir()
local custom_metadata_filename = "custom_metadata.lua"
local is_hash_location_enabled
local hash_path_cache = {}
local doc_hash_cache = {}
function DocSettings.isHashLocationEnabled()
if is_hash_location_enabled == nil then
@ -93,13 +93,13 @@ function DocSettings:getSidecarDir(doc_path, force_location)
if location == "dir" then
path = DOCSETTINGS_DIR .. path
elseif location == "hash" then
local hsh = hash_path_cache[doc_path]
local hsh = doc_hash_cache[doc_path]
if not hsh then
local file = io.open(doc_path, 'rb')
if not file then return path .. ".sdr" end
hsh = util.partialMD5(file)
file:close()
hash_path_cache[doc_path] = hsh
hsh = util.partialMD5(doc_path)
if not hsh then -- fallback to "doc"
return path .. ".sdr"
end
doc_hash_cache[doc_path] = hsh
logger.dbg("DocSettings: Caching new partial MD5 hash for", doc_path, "as", hsh)
else
logger.dbg("DocSettings: Using cached partial MD5 hash for", doc_path, "as", hsh)

@ -9,7 +9,6 @@ local Math = require("optmath")
local TileCacheItem = require("document/tilecacheitem")
local lfs = require("libs/libkoreader-lfs")
local logger = require("logger")
local util = require("util")
--[[
This is an abstract interface to a document
@ -144,31 +143,6 @@ function Document:discardChange()
self.is_edited = false
end
--- Return the partial MD5 digest of this document, computing it on first use.
-- The digest is kept in the document settings under "partial_md5_checksum",
-- so that document saving features cannot change the checksum afterwards.
-- @param docsettings optional doc settings object; when omitted, the settings
--   for self.file are opened for the duration of the call and closed again.
-- @return the stored or newly computed digest; nothing when self.file is
--   unset or the file cannot be opened.
function Document:fastDigest(docsettings)
    if not self.file then return end
    local handle = io.open(self.file, 'rb')
    if not handle then return end

    -- Fall back to a temporary settings object when the caller gave none.
    local owns_settings = not docsettings
    if owns_settings then
        docsettings = require("docsettings"):open(self.file)
    end

    local digest = docsettings:readSetting("partial_md5_checksum")
    if not digest then
        logger.dbg("computing and storing partial_md5_checksum")
        digest = util.partialMD5(handle)
        docsettings:saveSetting("partial_md5_checksum", digest)
    end

    -- Only close the settings we opened ourselves.
    if owns_settings then
        docsettings:close()
    end
    handle:close()
    return digest
end
-- this might be overridden by a document implementation
function Document:getNativePageDimensions(pageno)
local hash = "pgdim|"..self.file.."|"..pageno

@ -5,6 +5,7 @@ This module contains miscellaneous helper functions for the KOReader frontend.
local BaseUtil = require("ffi/util")
local Utf8Proc = require("ffi/utf8proc")
local lfs = require("libs/libkoreader-lfs")
local md5 = require("ffi/sha2").md5
local _ = require("gettext")
local C_ = _.pgettext
local T = BaseUtil.template
@ -1014,14 +1015,14 @@ end
-- Note that if PDF file size is around 1024, 4096, 16384, 65536, 262144
-- 1048576, 4194304, 16777216, 67108864, 268435456 or 1073741824, appending data
-- by highlighting in KOReader may change the digest value.
function util.partialMD5(file)
local bit = require("bit")
local md5 = require("ffi/sha2").md5
local leftshift = bit.lshift
function util.partialMD5(filepath)
if not filepath then return end
local file = io.open(filepath, "rb")
if not file then return end
local step, size = 1024, 1024
local update = md5()
for i = -1, 10 do
file:seek("set", leftshift(step, 2*i))
file:seek("set", lshift(step, 2*i))
local sample = file:read(size)
if sample then
update(sample)
@ -1029,6 +1030,7 @@ function util.partialMD5(file)
break
end
end
file:close()
return update()
end

@ -169,11 +169,6 @@ function KOSync:onDispatcherRegisterActions()
end
function KOSync:onReaderReady()
-- Make sure checksum has been calculated before we ever query it,
-- to prevent document saving features from affecting the checksum,
-- and eventually affecting the document identity for the progress sync feature.
self.view.document:fastDigest(self.ui.doc_settings)
if self.settings.auto_sync then
UIManager:nextTick(function()
self:getProgress(true, false)

@ -76,6 +76,7 @@ local ReaderStatistics = Widget:extend{
avg_time = nil,
page_stat = nil, -- Dictionary, indexed by page (hash), contains a list (array) of { timestamp, duration } tuples.
data = nil, -- table
doc_md5 = nil,
}
-- NOTE: This is used in a migration script by ui/data/onetime_migration,
@ -118,7 +119,6 @@ function ReaderStatistics:init()
highlights = 0,
notes = 0,
pages = 0,
md5 = nil,
}
self.start_current_period = os.time()
@ -186,9 +186,6 @@ function ReaderStatistics:initData()
self.data.series = series or "N/A"
self.data.pages = self.document:getPageCount()
if not self.data.md5 then
self.data.md5 = self:partialMd5(self.document.file)
end
-- Update these numbers to what's actually stored in the settings
self.data.highlights, self.data.notes = self.ui.bookmark:getNumberOfHighlightsAndNotes()
self.id_curr_book = self:getIdBookDB()
@ -430,29 +427,6 @@ Please wait…
conn:close()
end
--- Compute a partial MD5 digest of a file by sampling it at fixed offsets.
-- Samples of 1024 bytes are read at offsets lshift(1024, 2*i) for i = -1 .. 10;
-- sampling stops as soon as a read returns nothing.
-- @param file path of the file to digest
-- @return the value returned by calling the md5 updater with no argument,
--   or nil when `file` is nil or the file cannot be opened
function ReaderStatistics:partialMd5(file)
    if file == nil then
        return nil
    end
    local file_handle = io.open(file, 'rb')
    if not file_handle then
        -- io.open returns nil on failure (missing or unreadable file);
        -- bail out instead of raising on file_handle:seek() below.
        return nil
    end
    local bit = require("bit")
    local md5 = require("ffi/sha2").md5
    local lshift = bit.lshift
    local step, size = 1024, 1024
    local update = md5()
    for i = -1, 10 do
        file_handle:seek("set", lshift(step, 2*i))
        local sample = file_handle:read(size)
        if sample then
            update(sample)
        else
            break
        end
    end
    file_handle:close()
    return update()
end
-- Mainly so we don't duplicate the schema twice between the creation/upgrade codepaths
local STATISTICS_DB_PAGE_STAT_DATA_SCHEMA = [[
CREATE TABLE IF NOT EXISTS page_stat_data
@ -642,13 +616,14 @@ function ReaderStatistics:addBookStatToDB(book_stats, conn)
AND md5 = ?;
]]
local stmt = conn:prepare(sql_stmt)
local result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step()
local result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step()
local nr_id = tonumber(result[1])
if nr_id == 0 then
local partial_md5 = util.partialMD5(book_stats.file)
stmt = conn:prepare("INSERT INTO book VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);")
stmt:reset():bind(book_stats.title, book_stats.authors, book_stats.notes,
last_open_book, book_stats.highlights, book_stats.pages,
book_stats.series, book_stats.language, self:partialMd5(book_stats.file), total_read_time, total_read_pages) :step()
book_stats.series, book_stats.language, partial_md5, total_read_time, total_read_pages) :step()
sql_stmt = [[
SELECT last_insert_rowid() AS num;
]]
@ -662,7 +637,7 @@ function ReaderStatistics:addBookStatToDB(book_stats, conn)
AND md5 = ?;
]]
stmt = conn:prepare(sql_stmt)
result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step()
result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step()
id_book = result[1]
end
@ -781,14 +756,14 @@ function ReaderStatistics:getIdBookDB()
AND md5 = ?;
]]
local stmt = conn:prepare(sql_stmt)
local result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step()
local result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step()
local nr_id = tonumber(result[1])
if nr_id == 0 then
-- Not in the DB yet, initialize it
stmt = conn:prepare("INSERT INTO book VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);")
stmt:reset():bind(self.data.title, self.data.authors, self.data.notes,
os.time(), self.data.highlights, self.data.pages,
self.data.series, self.data.language, self.data.md5, 0, 0):step()
self.data.series, self.data.language, self.doc_md5, 0, 0):step()
sql_stmt = [[
SELECT last_insert_rowid() AS num;
]]
@ -802,7 +777,7 @@ function ReaderStatistics:getIdBookDB()
AND md5 = ?;
]]
stmt = conn:prepare(sql_stmt)
result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step()
result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step()
id_book = result[1]
end
stmt:close()
@ -857,17 +832,13 @@ function ReaderStatistics:onBookMetadataChanged(prop_updated)
-- Not the current document: we have to find its id in the db, from the (old) title/authors/md5
local db_md5, db_title, db_authors, db_authors_legacy
if DocSettings:hasSidecarFile(filepath) then
local doc_settings = DocSettings:open(filepath)
local stats = doc_settings:readSetting("stats")
if stats then
db_md5 = stats.md5
-- Note: stats.title and stats.authors may be obsolete, if the metadata
-- has previously been updated and the document never re-opened since.
logger.dbg(log_prefix, "got md5 from docsettings:", db_md5)
end
db_md5 = DocSettings:open(filepath):readSetting("partial_md5_checksum")
-- Note: stats.title and stats.authors may be obsolete, if the metadata
-- has previously been updated and the document never re-opened since.
logger.dbg(log_prefix, "got md5 from docsettings:", db_md5)
end
if not db_md5 then
db_md5 = self:partialMd5(filepath)
db_md5 = util.partialMD5(filepath)
logger.dbg(log_prefix, "computed md5:", db_md5)
end
@ -2819,11 +2790,9 @@ function ReaderStatistics:onReadingResumed()
self._reading_paused_ts = nil
end
function ReaderStatistics:onReadSettings(config)
function ReaderStatistics:onReaderReady(config)
self.data = config:readSetting("stats", { performance_in_pages = {} })
end
function ReaderStatistics:onReaderReady()
self.doc_md5 = config:readSetting("partial_md5_checksum")
-- we have correct page count now, do the actual initialization work
self:initData()
self.view.footer:onUpdateFooter()

@ -39,9 +39,6 @@ describe("PDF document module", function()
local clip1 = doc:clipPagePNGString(pos0, pos1, pboxes, "lighten")
assert.truthy(clip1)
end)
it("should calculate fast digest", function()
assert.is_equal(doc:fastDigest(), "41cce710f34e5ec21315e19c99821415")
end)
it("should close document", function()
doc:close()
end)
@ -68,9 +65,6 @@ describe("EPUB document module", function()
assert.are.same(image:getWidth(), 442)
assert.are.same(image:getHeight(), 616)
end)
it("should calculate fast digest", function()
assert.is_equal(doc:fastDigest(), "59d481d168cca6267322f150c5f6a2a3")
end)
it("should register droid sans fallback", function()
local face_list = cre.getFontFaces()
assert.is_equal(face_list[1], "Droid Sans Mono")

@ -335,6 +335,15 @@ describe("util module", function()
end)
end)
describe("partialMD5()", function()
    -- Expected value goes first, so that a failing assertion reports
    -- "expected" and "passed in" the right way round.
    it("should calculate partial md5 hash of pdf file", function()
        assert.is_equal("41cce710f34e5ec21315e19c99821415", util.partialMD5("spec/front/unit/data/tall.pdf"))
    end)
    it("should calculate partial md5 hash of epub file", function()
        assert.is_equal("59d481d168cca6267322f150c5f6a2a3", util.partialMD5("spec/front/unit/data/leaves.epub"))
    end)
end)
describe("fixUtf8()", function()
it("should replace invalid UTF-8 characters with an underscore", function()
assert.is_equal("\127 _ _\127 ", util.fixUtf8("\127 \128 \194\127 ", "_"))

Loading…
Cancel
Save