From 2ed2c2c23da15b9335f7a664fa5cf35f55b1523b Mon Sep 17 00:00:00 2001 From: hius07 <62179190+hius07@users.noreply.github.com> Date: Sun, 15 Oct 2023 07:47:09 +0300 Subject: [PATCH] md5: centralize and deduplicate (#11003) Document partial md5 hash is calculated by util.partialMD5() and stored in doc_settings as "partial_md5_checksum" on the first document opening. --- frontend/apps/reader/readerui.lua | 18 ++++---- frontend/docsettings.lua | 14 +++---- frontend/document/document.lua | 26 ------------ frontend/util.lua | 12 +++--- plugins/kosync.koplugin/main.lua | 5 --- plugins/statistics.koplugin/main.lua | 61 +++++++--------------------- spec/unit/document_spec.lua | 6 --- spec/unit/util_spec.lua | 9 ++++ 8 files changed, 47 insertions(+), 104 deletions(-) diff --git a/frontend/apps/reader/readerui.lua b/frontend/apps/reader/readerui.lua index ab5a68512..acaa30599 100644 --- a/frontend/apps/reader/readerui.lua +++ b/frontend/apps/reader/readerui.lua @@ -464,23 +464,23 @@ function ReaderUI:init() end self.postInitCallback = nil - -- Now that document is loaded, store book metadata in settings - -- (so that filemanager can use it from sideCar file to display - -- Book information). + -- Now that document is loaded, store book metadata in settings. local props = self.document:getProps() self.doc_settings:saveSetting("doc_props", props) -- And have an extended and customized copy in memory for quick access. self.doc_props = FileManagerBookInfo.extendProps(props, self.document.file) -- Set "reading" status if there is no status. 
- local summary = self.doc_settings:readSetting("summary") - if not (summary and summary.status) then - if not summary then - summary = {} - end + local summary = self.doc_settings:readSetting("summary", {}) + if summary.status == nil then summary.status = "reading" summary.modified = os.date("%Y-%m-%d", os.time()) - self.doc_settings:saveSetting("summary", summary) + end + + local md5 = self.doc_settings:readSetting("partial_md5_checksum") + if md5 == nil then + md5 = util.partialMD5(self.document.file) + self.doc_settings:saveSetting("partial_md5_checksum", md5) end require("readhistory"):addItem(self.document.file) -- (will update "lastfile") diff --git a/frontend/docsettings.lua b/frontend/docsettings.lua index 4c18d6995..52ab683e5 100644 --- a/frontend/docsettings.lua +++ b/frontend/docsettings.lua @@ -20,7 +20,7 @@ local DOCSETTINGS_HASH_DIR = DataStorage:getDocSettingsHashDir() local custom_metadata_filename = "custom_metadata.lua" local is_hash_location_enabled -local hash_path_cache = {} +local doc_hash_cache = {} function DocSettings.isHashLocationEnabled() if is_hash_location_enabled == nil then @@ -93,13 +93,13 @@ function DocSettings:getSidecarDir(doc_path, force_location) if location == "dir" then path = DOCSETTINGS_DIR .. path elseif location == "hash" then - local hsh = hash_path_cache[doc_path] + local hsh = doc_hash_cache[doc_path] if not hsh then - local file = io.open(doc_path, 'rb') - if not file then return path .. ".sdr" end - hsh = util.partialMD5(file) - file:close() - hash_path_cache[doc_path] = hsh + hsh = util.partialMD5(doc_path) + if not hsh then -- fallback to "doc" + return path .. 
".sdr" + end + doc_hash_cache[doc_path] = hsh logger.dbg("DocSettings: Caching new partial MD5 hash for", doc_path, "as", hsh) else logger.dbg("DocSettings: Using cached partial MD5 hash for", doc_path, "as", hsh) diff --git a/frontend/document/document.lua b/frontend/document/document.lua index ea94ebb2f..c5b3dd205 100644 --- a/frontend/document/document.lua +++ b/frontend/document/document.lua @@ -9,7 +9,6 @@ local Math = require("optmath") local TileCacheItem = require("document/tilecacheitem") local lfs = require("libs/libkoreader-lfs") local logger = require("logger") -local util = require("util") --[[ This is an abstract interface to a document @@ -144,31 +143,6 @@ function Document:discardChange() self.is_edited = false end --- calculate partial digest of the document and store in its docsettings to avoid document saving --- feature to change its checksum. -function Document:fastDigest(docsettings) - if not self.file then return end - local file = io.open(self.file, 'rb') - if file then - local tmp_docsettings = false - if not docsettings then -- if not provided, open/create it - docsettings = require("docsettings"):open(self.file) - tmp_docsettings = true - end - local result = docsettings:readSetting("partial_md5_checksum") - if not result then - logger.dbg("computing and storing partial_md5_checksum") - result = util.partialMD5(file) - docsettings:saveSetting("partial_md5_checksum", result) - end - if tmp_docsettings then - docsettings:close() - end - file:close() - return result - end -end - -- this might be overridden by a document implementation function Document:getNativePageDimensions(pageno) local hash = "pgdim|"..self.file.."|"..pageno diff --git a/frontend/util.lua b/frontend/util.lua index 0dbe0e2e7..cab70d9f1 100644 --- a/frontend/util.lua +++ b/frontend/util.lua @@ -5,6 +5,7 @@ This module contains miscellaneous helper functions for the KOReader frontend. 
local BaseUtil = require("ffi/util") local Utf8Proc = require("ffi/utf8proc") local lfs = require("libs/libkoreader-lfs") +local md5 = require("ffi/sha2").md5 local _ = require("gettext") local C_ = _.pgettext local T = BaseUtil.template @@ -1014,14 +1015,14 @@ end -- Note that if PDF file size is around 1024, 4096, 16384, 65536, 262144 -- 1048576, 4194304, 16777216, 67108864, 268435456 or 1073741824, appending data -- by highlighting in KOReader may change the digest value. -function util.partialMD5(file) - local bit = require("bit") - local md5 = require("ffi/sha2").md5 - local leftshift = bit.lshift +function util.partialMD5(filepath) + if not filepath then return end + local file = io.open(filepath, "rb") + if not file then return end local step, size = 1024, 1024 local update = md5() for i = -1, 10 do - file:seek("set", leftshift(step, 2*i)) + file:seek("set", lshift(step, 2*i)) local sample = file:read(size) if sample then update(sample) @@ -1029,6 +1030,7 @@ function util.partialMD5(file) break end end + file:close() return update() end diff --git a/plugins/kosync.koplugin/main.lua b/plugins/kosync.koplugin/main.lua index 22f7ca3e4..8fa9f6cde 100644 --- a/plugins/kosync.koplugin/main.lua +++ b/plugins/kosync.koplugin/main.lua @@ -169,11 +169,6 @@ function KOSync:onDispatcherRegisterActions() end function KOSync:onReaderReady() - -- Make sure checksum has been calculated before we ever query it, - -- to prevent document saving features from affecting the checksum, - -- and eventually affecting the document identity for the progress sync feature. 
- self.view.document:fastDigest(self.ui.doc_settings) - if self.settings.auto_sync then UIManager:nextTick(function() self:getProgress(true, false) diff --git a/plugins/statistics.koplugin/main.lua b/plugins/statistics.koplugin/main.lua index 53a8c2b9c..6353054e8 100644 --- a/plugins/statistics.koplugin/main.lua +++ b/plugins/statistics.koplugin/main.lua @@ -76,6 +76,7 @@ local ReaderStatistics = Widget:extend{ avg_time = nil, page_stat = nil, -- Dictionary, indexed by page (hash), contains a list (array) of { timestamp, duration } tuples. data = nil, -- table + doc_md5 = nil, } -- NOTE: This is used in a migration script by ui/data/onetime_migration, @@ -118,7 +119,6 @@ function ReaderStatistics:init() highlights = 0, notes = 0, pages = 0, - md5 = nil, } self.start_current_period = os.time() @@ -186,9 +186,6 @@ function ReaderStatistics:initData() self.data.series = series or "N/A" self.data.pages = self.document:getPageCount() - if not self.data.md5 then - self.data.md5 = self:partialMd5(self.document.file) - end -- Update these numbers to what's actually stored in the settings self.data.highlights, self.data.notes = self.ui.bookmark:getNumberOfHighlightsAndNotes() self.id_curr_book = self:getIdBookDB() @@ -430,29 +427,6 @@ Please wait… conn:close() end -function ReaderStatistics:partialMd5(file) - if file == nil then - return nil - end - local bit = require("bit") - local md5 = require("ffi/sha2").md5 - local lshift = bit.lshift - local step, size = 1024, 1024 - local update = md5() - local file_handle = io.open(file, 'rb') - for i = -1, 10 do - file_handle:seek("set", lshift(step, 2*i)) - local sample = file_handle:read(size) - if sample then - update(sample) - else - break - end - end - file_handle:close() - return update() -end - -- Mainly so we don't duplicate the schema twice between the creation/upgrade codepaths local STATISTICS_DB_PAGE_STAT_DATA_SCHEMA = [[ CREATE TABLE IF NOT EXISTS page_stat_data @@ -642,13 +616,14 @@ function 
ReaderStatistics:addBookStatToDB(book_stats, conn) AND md5 = ?; ]] local stmt = conn:prepare(sql_stmt) - local result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step() + local result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step() local nr_id = tonumber(result[1]) if nr_id == 0 then + local partial_md5 = util.partialMD5(book_stats.file) stmt = conn:prepare("INSERT INTO book VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);") stmt:reset():bind(book_stats.title, book_stats.authors, book_stats.notes, last_open_book, book_stats.highlights, book_stats.pages, - book_stats.series, book_stats.language, self:partialMd5(book_stats.file), total_read_time, total_read_pages) :step() + book_stats.series, book_stats.language, partial_md5, total_read_time, total_read_pages) :step() sql_stmt = [[ SELECT last_insert_rowid() AS num; ]] @@ -662,7 +637,7 @@ function ReaderStatistics:addBookStatToDB(book_stats, conn) AND md5 = ?; ]] stmt = conn:prepare(sql_stmt) - result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step() + result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step() id_book = result[1] end @@ -781,14 +756,14 @@ function ReaderStatistics:getIdBookDB() AND md5 = ?; ]] local stmt = conn:prepare(sql_stmt) - local result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step() + local result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step() local nr_id = tonumber(result[1]) if nr_id == 0 then -- Not in the DB yet, initialize it stmt = conn:prepare("INSERT INTO book VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);") stmt:reset():bind(self.data.title, self.data.authors, self.data.notes, os.time(), self.data.highlights, self.data.pages, - self.data.series, self.data.language, self.data.md5, 0, 0):step() + self.data.series, self.data.language, self.doc_md5, 0, 0):step() sql_stmt = [[ SELECT last_insert_rowid() AS num; ]] @@ -802,7 
+777,7 @@ function ReaderStatistics:getIdBookDB() AND md5 = ?; ]] stmt = conn:prepare(sql_stmt) - result = stmt:reset():bind(self.data.title, self.data.authors, self.data.md5):step() + result = stmt:reset():bind(self.data.title, self.data.authors, self.doc_md5):step() id_book = result[1] end stmt:close() @@ -857,17 +832,13 @@ function ReaderStatistics:onBookMetadataChanged(prop_updated) -- Not the current document: we have to find its id in the db, from the (old) title/authors/md5 local db_md5, db_title, db_authors, db_authors_legacy if DocSettings:hasSidecarFile(filepath) then - local doc_settings = DocSettings:open(filepath) - local stats = doc_settings:readSetting("stats") - if stats then - db_md5 = stats.md5 - -- Note: stats.title and stats.authors may be osbolete, if the metadata - -- has previously been updated and the document never re-opened since. - logger.dbg(log_prefix, "got md5 from docsettings:", db_md5) - end + db_md5 = DocSettings:open(filepath):readSetting("partial_md5_checksum") + -- Note: stats.title and stats.authors may be obsolete, if the metadata + -- has previously been updated and the document never re-opened since. 
+ logger.dbg(log_prefix, "got md5 from docsettings:", db_md5) end if not db_md5 then - db_md5 = self:partialMd5(filepath) + db_md5 = util.partialMD5(filepath) logger.dbg(log_prefix, "computed md5:", db_md5) end @@ -2819,11 +2790,9 @@ function ReaderStatistics:onReadingResumed() self._reading_paused_ts = nil end -function ReaderStatistics:onReadSettings(config) +function ReaderStatistics:onReaderReady(config) self.data = config:readSetting("stats", { performance_in_pages = {} }) -end - -function ReaderStatistics:onReaderReady() + self.doc_md5 = config:readSetting("partial_md5_checksum") -- we have correct page count now, do the actual initialization work self:initData() self.view.footer:onUpdateFooter() diff --git a/spec/unit/document_spec.lua b/spec/unit/document_spec.lua index 4bb56d377..1aee76bfa 100644 --- a/spec/unit/document_spec.lua +++ b/spec/unit/document_spec.lua @@ -39,9 +39,6 @@ describe("PDF document module", function() local clip1 = doc:clipPagePNGString(pos0, pos1, pboxes, "lighten") assert.truthy(clip1) end) - it("should calculate fast digest", function() - assert.is_equal(doc:fastDigest(), "41cce710f34e5ec21315e19c99821415") - end) it("should close document", function() doc:close() end) @@ -68,9 +65,6 @@ describe("EPUB document module", function() assert.are.same(image:getWidth(), 442) assert.are.same(image:getHeight(), 616) end) - it("should calculate fast digest", function() - assert.is_equal(doc:fastDigest(), "59d481d168cca6267322f150c5f6a2a3") - end) it("should register droid sans fallback", function() local face_list = cre.getFontFaces() assert.is_equal(face_list[1], "Droid Sans Mono") diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua index 9432bddcb..cc5ff08a0 100644 --- a/spec/unit/util_spec.lua +++ b/spec/unit/util_spec.lua @@ -335,6 +335,15 @@ describe("util module", function() end) end) + describe("partialMD5()", function() + it("should calculate partial md5 hash of pdf file", function() + 
assert.is_equal(util.partialMD5("spec/front/unit/data/tall.pdf"), "41cce710f34e5ec21315e19c99821415") + end) + it("should calculate partial md5 hash of epub file", function() + assert.is_equal(util.partialMD5("spec/front/unit/data/leaves.epub"), "59d481d168cca6267322f150c5f6a2a3") + end) + end) + describe("fixUtf8()", function() it("should replace invalid UTF-8 characters with an underscore", function() assert.is_equal("\127 _ _\127 ", util.fixUtf8("\127 \128 \194\127 ", "_"))