From b80dd1f966214f5db8ba9c89920e92713438d9dc Mon Sep 17 00:00:00 2001 From: chrox Date: Mon, 17 Nov 2014 17:58:25 +0800 Subject: [PATCH] add fulltext search for PDF documents --- frontend/apps/reader/modules/readersearch.lua | 12 +- frontend/document/credocument.lua | 2 +- frontend/document/djvudocument.lua | 4 + frontend/document/koptinterface.lua | 117 ++++++++++++++++-- frontend/document/pdfdocument.lua | 4 + spec/unit/readersearch_spec.lua | 100 +++++++++++++++ 6 files changed, 225 insertions(+), 14 deletions(-) diff --git a/frontend/apps/reader/modules/readersearch.lua b/frontend/apps/reader/modules/readersearch.lua index ce35ea661..8851d0156 100644 --- a/frontend/apps/reader/modules/readersearch.lua +++ b/frontend/apps/reader/modules/readersearch.lua @@ -8,7 +8,7 @@ local _ = require("gettext") local ReaderSearch = InputContainer:new{ direction = 0, -- 0 for search forward, 1 for search backward - case_insensitive = 1, -- default to case insensitive + case_insensitive = true, -- default to case insensitive } function ReaderSearch:init() @@ -33,7 +33,12 @@ function ReaderSearch:onShowSearchDialog(text) return function() local res = search_func(self, text, param) if res then - self.ui.link:onGotoLink(res[1].start) + if self.ui.document.info.has_pages then + self.ui.link:onGotoLink({page = res.page - 1}) + self.view.highlight.temp[res.page] = res + else + self.ui.link:onGotoLink(res[1].start) + end end end end @@ -73,7 +78,8 @@ end function ReaderSearch:search(pattern, origin) local direction = self.direction local case = self.case_insensitive - return self.ui.document:findText(pattern, origin, direction, case) + local page = self.view.state.page + return self.ui.document:findText(pattern, origin, direction, case, page) end function ReaderSearch:searchFromStart(pattern) diff --git a/frontend/document/credocument.lua b/frontend/document/credocument.lua index 425a089ff..3e4a7f0eb 100644 --- a/frontend/document/credocument.lua +++ b/frontend/document/credocument.lua @@ -430,7 +430,7 @@ end function CreDocument:findText(pattern, origin, reverse, caseInsensitive) DEBUG("CreDocument: find text", pattern, origin, reverse, caseInsensitive) - return self._document:findText(pattern, origin, reverse, caseInsensitive) + return self._document:findText(pattern, origin, reverse, caseInsensitive and 1 or 0) end function CreDocument:register(registry) diff --git a/frontend/document/djvudocument.lua b/frontend/document/djvudocument.lua index d91e38f2e..084213965 100644 --- a/frontend/document/djvudocument.lua +++ b/frontend/document/djvudocument.lua @@ -105,6 +105,10 @@ function DjvuDocument:getCoverPageImage() return self.koptinterface:getCoverPageImage(self) end +function DjvuDocument:findText(pattern, origin, reverse, caseInsensitive, page) + return self.koptinterface:findText(self, pattern, origin, reverse, caseInsensitive, page) +end + function DjvuDocument:renderPage(pageno, rect, zoom, rotation, gamma, render_mode) return self.koptinterface:renderPage(self, pageno, rect, zoom, rotation, gamma, render_mode) end diff --git a/frontend/document/koptinterface.lua b/frontend/document/koptinterface.lua index d3cf90d6a..268ef0abf 100644 --- a/frontend/document/koptinterface.lua +++ b/frontend/document/koptinterface.lua @@ -934,11 +934,7 @@ end transform position in native page to reflowed page ]]-- function KoptInterface:nativeToReflowPosTransform(doc, pageno, pos) - local bbox = doc:getPageBBox(pageno) - local context_hash = self:getContextHash(doc, pageno, bbox) - local kctx_hash = "kctx|"..context_hash - local cached = Cache:check(kctx_hash) - local kc = self:waitForContext(cached.kctx) + local kc = self:getCachedContext(doc, pageno) --DEBUG("transform native pos", pos) local rpos = {} rpos.x, rpos.y = kc:nativeToReflowPosTransform(pos.x, pos.y) @@ -950,11 +946,7 @@ end transform position in reflowed page to native page ]]-- function KoptInterface:reflowToNativePosTransform(doc, pageno, abs_pos, rel_pos) - local bbox = doc:getPageBBox(pageno) - local context_hash = self:getContextHash(doc, pageno, bbox) - local kctx_hash = "kctx|"..context_hash - local cached = Cache:check(kctx_hash) - local kc = self:waitForContext(cached.kctx) + local kc = self:getCachedContext(doc, pageno) --kc:setDebug() --DEBUG("transform reflowed pos", abs_pos, rel_pos) local npos = {} @@ -1073,6 +1065,111 @@ function KoptInterface:nativeToPageRectTransform(doc, pageno, rect) end end +local function all_matches(boxes, pattern, caseInsensitive) + -- pattern list of single words + local plist = {} + -- split utf-8 characters + for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do + -- split space seperated words + for word in words:gmatch("[^%s]+") do + table.insert(plist, caseInsensitive and word:lower() or word) + end + end + -- return mached word indices from index i, j + local function match(i, j) + local pindex = 1 + local matched_indices = {} + while true do + if #boxes[i] < j then + j = j - #boxes[i] + i = i + 1 + end + if i > #boxes then break end + local box = boxes[i][j] + local word = caseInsensitive and box.word:lower() or box.word + if word:match(plist[pindex]) then + table.insert(matched_indices, {i, j}) + if pindex == #plist then + return matched_indices + else + j = j + 1 + pindex = pindex + 1 + end + else + break + end + end + end + return coroutine.wrap(function() + for i, line in ipairs(boxes) do + for j, box in ipairs(line) do + local matches = match(i, j) + if matches then + coroutine.yield(matches) + end + end + end + end) +end + +function KoptInterface:findAllMatches(doc, pattern, caseInsensitive, page) + local text_boxes = doc:getPageTextBoxes(page) + if not text_boxes then return end + --DEBUG("boxes", text_boxes) + local matches = {} + for indices in all_matches(text_boxes or {}, pattern, caseInsensitive) do + for _, index in ipairs(indices) do + local i, j = unpack(index) + local word = text_boxes[i][j] + local word_box = { + x = word.x0, y = word.y0, + w = word.x1 - word.x0, + h = word.y1 - word.y0, + } + -- rects will be transformed to reflowed page rects if needed + table.insert(matches, self:nativeToPageRectTransform(doc, page, word_box)) + end + end + return matches +end + +function KoptInterface:findText(doc, pattern, origin, reverse, caseInsensitive, pageno) + DEBUG("Koptinterface: find text", pattern, origin, reverse, caseInsensitive, pageno) + local last_pageno = doc:getPageCount() + local start_page, end_page + if reverse == 1 then + -- backward + if origin == 0 then + -- from end of current page to first page + start_page, end_page = pageno, 1 + elseif origin == -1 then + -- from the last page to end of current page + start_page, end_page = last_pageno, pageno + 1 + elseif origin == 1 then + start_page, end_page = pageno - 1, 1 + end + else + -- forward + if origin == 0 then + -- from current page to the last page + start_page, end_page = pageno, last_pageno + elseif origin == -1 then + -- from the first page to current page + start_page, end_page = 1, pageno - 1 + elseif origin == 1 then + -- from next page to the last page + start_page, end_page = pageno + 1, last_pageno + end + end + for i = start_page, end_page, (reverse == 1) and -1 or 1 do + local matches = self:findAllMatches(doc, pattern, caseInsensitive, i) + if #matches > 0 then + matches.page = i + return matches + end + end +end + --[[ helper functions --]] diff --git a/frontend/document/pdfdocument.lua b/frontend/document/pdfdocument.lua index 706f2a706..bcf6f2622 100644 --- a/frontend/document/pdfdocument.lua +++ b/frontend/document/pdfdocument.lua @@ -183,6 +183,10 @@ function PdfDocument:getCoverPageImage() return self.koptinterface:getCoverPageImage(self) end +function PdfDocument:findText(pattern, origin, reverse, caseInsensitive, page) + return self.koptinterface:findText(self, pattern, origin, reverse, caseInsensitive, page) +end + function PdfDocument:renderPage(pageno, rect, zoom, rotation, gamma, render_mode) return self.koptinterface:renderPage(self, pageno, rect, zoom, rotation, gamma, render_mode) end diff --git a/spec/unit/readersearch_spec.lua b/spec/unit/readersearch_spec.lua index 7c982277c..7b05c3a04 100644 --- a/spec/unit/readersearch_spec.lua +++ b/spec/unit/readersearch_spec.lua @@ -4,6 +4,7 @@ local ReaderUI = require("apps/reader/readerui") local DEBUG = require("dbg") local sample_epub = "spec/front/unit/data/juliet.epub" +local sample_pdf = "spec/front/unit/data/sample.pdf" describe("Readersearch module", function() describe("search API for EPUB documents", function() @@ -90,5 +91,104 @@ describe("Readersearch module", function() end) end) describe("search API for PDF documents", function() + local doc, search, paging + setup(function() + local readerui = ReaderUI:new{ + document = DocumentRegistry:openDocument(sample_pdf), + } + doc = readerui.document + search = readerui.search + paging = readerui.paging + end) + it("should match single word with case insensitive option in one page", function() + assert.are.equal(9, #doc.koptinterface:findAllMatches(doc, "what", true, 20)) + assert.are.equal(51, #doc.koptinterface:findAllMatches(doc, "the", true, 20)) + assert.are.equal(0, #doc.koptinterface:findAllMatches(doc, "xxxx", true, 20)) + end) + it("should match single word with case sensitive option in one page", function() + assert.are.equal(7, #doc.koptinterface:findAllMatches(doc, "what", false, 20)) + assert.are.equal(49, #doc.koptinterface:findAllMatches(doc, "the", false, 20)) + assert.are.equal(0, #doc.koptinterface:findAllMatches(doc, "xxxx", false, 20)) + end) + it("should match phrase in one page", function() + assert.are.equal(2*2, #doc.koptinterface:findAllMatches(doc, "mean that", true, 20)) + end) + it("should match whole phrase in one page", function() + assert.are.equal(1*3, #doc.koptinterface:findAllMatches(doc, "mean that the", true, 20)) + end) + it("should match with lua pattern", function() + assert.are.equal(7*1, #doc.koptinterface:findAllMatches(doc, "chapter", true, 30)) + assert.are.equal(3*2, #doc.koptinterface:findAllMatches(doc, "chapter %d", true, 30)) + assert.are.equal(2*2, #doc.koptinterface:findAllMatches(doc, "chapter %d%d", true, 30)) + assert.are.equal(0*2, #doc.koptinterface:findAllMatches(doc, "chapter %d%d%d", true, 30)) + end) + it("should not match empty string", function() + assert.are.equal(0, #doc.koptinterface:findAllMatches(doc, "", true, 1)) + end) + it("should not match on page without text layer", function() + assert.are.equal(0, #doc.koptinterface:findAllMatches(doc, "e", true, 1)) + end) + it("should search backward", function() + paging:gotoPage(20) + assert.truthy(search:searchFromCurrent("test", 1)) + for i = 1, 40, 10 do + paging:gotoPage(i) + local words = search:searchFromCurrent("test", 1) + if words then + DEBUG("search backward: found at page", words.page) + assert.truthy(words.page <= i) + end + end + end) + it("should search forward", function() + paging:gotoPage(20) + assert.truthy(search:searchFromCurrent("test", 0)) + for i = 1, 40, 10 do + paging:gotoPage(i) + local words = search:searchFromCurrent("test", 0) + if words then + DEBUG("search forward: found at page", words.page) + assert.truthy(words.page >= i) + end + end + end) + it("should find the first occurrence", function() + for i = 20, 40, 10 do + paging:gotoPage(i) + local words = search:searchFromStart("test") + assert.truthy(words) + assert.are.equal(10, words.page) + end + for i = 1, 10, 2 do + paging:gotoPage(i) + local words = search:searchFromStart("test") + assert(words == nil) + end + end) + it("should find the last occurrence", function() + for i = 10, 30, 10 do + paging:gotoPage(i) + local words = search:searchFromEnd("test") + assert.truthy(words) + assert.are.equal(32, words.page) + end + for i = 40, 50, 2 do + paging:gotoPage(i) + local words = search:searchFromEnd("test") + assert(words == nil) + end + end) + it("should find all occurrences", function() + local count = 0 + paging:gotoPage(1) + local words = search:searchFromCurrent("test", 0) + while words do + count = count + #words + --DEBUG("found words", #words, words.page) + paging:gotoPage(words.page) + words = search:searchNext("test", 0) + end + assert.are.equal(11, count) + end) end) end)