add fulltext search for PDF documents

pull/1194/head
chrox 10 years ago
parent ae65d0ae09
commit b80dd1f966

@ -8,7 +8,7 @@ local _ = require("gettext")
-- Search widget state shared by all ReaderSearch instances.
-- `case_insensitive` is a boolean; it was previously set twice in this
-- constructor (a stale numeric 1 followed by true), and only the last
-- assignment ever took effect.
local ReaderSearch = InputContainer:new{
    direction = 0, -- 0 for search forward, 1 for search backward
    case_insensitive = true, -- default to case insensitive
}
function ReaderSearch:init()
@ -33,7 +33,12 @@ function ReaderSearch:onShowSearchDialog(text)
return function()
local res = search_func(self, text, param)
if res then
self.ui.link:onGotoLink(res[1].start)
if self.ui.document.info.has_pages then
self.ui.link:onGotoLink({page = res.page - 1})
self.view.highlight.temp[res.page] = res
else
self.ui.link:onGotoLink(res[1].start)
end
end
end
end
@ -73,7 +78,8 @@ end
--- Run a search through the document backend.
-- Reads the search direction and case flag from this widget and the
-- current page from the view state, then delegates to the document's
-- findText.
-- @param pattern text to look for
-- @param origin passed through unchanged to the document's findText
-- @return whatever the document's findText returns
function ReaderSearch:search(pattern, origin)
    local direction = self.direction
    local case = self.case_insensitive
    -- A stale `return` used to sit here ahead of the lines below, which is
    -- not valid Lua (`return` must end its block) and omitted the page.
    local page = self.view.state.page
    return self.ui.document:findText(pattern, origin, direction, case, page)
end
function ReaderSearch:searchFromStart(pattern)

@ -430,7 +430,7 @@ end
--- Find text in a CRE-rendered document.
-- Logs the request and forwards it to the underlying `_document` object.
-- @param pattern text to look for
-- @param origin passed through unchanged
-- @param reverse passed through unchanged
-- @param caseInsensitive truthy for a case-insensitive search
-- @return whatever the underlying document's findText returns
function CreDocument:findText(pattern, origin, reverse, caseInsensitive)
    DEBUG("CreDocument: find text", pattern, origin, reverse, caseInsensitive)
    -- The flag is converted to 1/0 before being handed to the backend; a
    -- stale second `return` that passed the raw flag has been removed
    -- (two consecutive `return` statements do not compile).
    return self._document:findText(pattern, origin, reverse, caseInsensitive and 1 or 0)
end
function CreDocument:register(registry)

@ -105,6 +105,10 @@ function DjvuDocument:getCoverPageImage()
return self.koptinterface:getCoverPageImage(self)
end
--- Full-text search entry point for DjVu documents.
-- Hands every argument, together with this document, to the KOPT
-- interface's findText and returns its result unchanged.
function DjvuDocument:findText(pattern, origin, reverse, caseInsensitive, page)
    local kopt = self.koptinterface
    return kopt:findText(self, pattern, origin, reverse, caseInsensitive, page)
end
--- Render a page of a DjVu document.
-- Hands every argument, together with this document, to the KOPT
-- interface's renderPage and returns its result unchanged.
function DjvuDocument:renderPage(pageno, rect, zoom, rotation, gamma, render_mode)
    local kopt = self.koptinterface
    return kopt:renderPage(self, pageno, rect, zoom, rotation, gamma, render_mode)
end

@ -934,11 +934,7 @@ end
transform position in native page to reflowed page
]]--
function KoptInterface:nativeToReflowPosTransform(doc, pageno, pos)
local bbox = doc:getPageBBox(pageno)
local context_hash = self:getContextHash(doc, pageno, bbox)
local kctx_hash = "kctx|"..context_hash
local cached = Cache:check(kctx_hash)
local kc = self:waitForContext(cached.kctx)
local kc = self:getCachedContext(doc, pageno)
--DEBUG("transform native pos", pos)
local rpos = {}
rpos.x, rpos.y = kc:nativeToReflowPosTransform(pos.x, pos.y)
@ -950,11 +946,7 @@ end
transform position in reflowed page to native page
]]--
function KoptInterface:reflowToNativePosTransform(doc, pageno, abs_pos, rel_pos)
local bbox = doc:getPageBBox(pageno)
local context_hash = self:getContextHash(doc, pageno, bbox)
local kctx_hash = "kctx|"..context_hash
local cached = Cache:check(kctx_hash)
local kc = self:waitForContext(cached.kctx)
local kc = self:getCachedContext(doc, pageno)
--kc:setDebug()
--DEBUG("transform reflowed pos", abs_pos, rel_pos)
local npos = {}
@ -1073,6 +1065,111 @@ function KoptInterface:nativeToPageRectTransform(doc, pageno, rect)
end
end
--- Build an iterator over every occurrence of a search phrase in a page.
--
-- `boxes` is a list of lines; each line is a list of word boxes whose text
-- is stored in a `word` field. `pattern` is split into single words, each
-- of which is used as a Lua pattern. An occurrence is a run of consecutive
-- word boxes (continuing onto following lines) that match the pattern
-- words in order.
--
-- @param boxes list of lines of word boxes
-- @param pattern search phrase; every word is treated as a Lua pattern
-- @param caseInsensitive when truthy, box text and pattern letters are
--        lowercased before matching
-- @return iterator function yielding, for each occurrence, a list of
--         {line_index, word_index} pairs (one pair per pattern word)
local function all_matches(boxes, pattern, caseInsensitive)
    -- Lowercase a pattern word while leaving "%x" escapes untouched:
    -- lowercasing blindly would turn complement classes such as %D into %d.
    local function fold_pattern(word)
        return (word:gsub("%%?.", function(token)
            if #token == 2 then return token end
            return token:lower()
        end))
    end

    -- pattern list of single words
    local plist = {}
    -- split utf-8 characters
    for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
        -- split space separated words
        for word in words:gmatch("[^%s]+") do
            plist[#plist + 1] = caseInsensitive and fold_pattern(word) or word
        end
    end

    -- return matched word indices starting at line i, word j,
    -- or nil when the phrase does not start there
    local function match(i, j)
        local matched_indices = {}
        for pindex = 1, #plist do
            -- roll over to the next line, stepping across empty lines too
            while boxes[i] and j > #boxes[i] do
                j = j - #boxes[i]
                i = i + 1
            end
            local box = boxes[i] and boxes[i][j]
            -- ran off the page, or the box carries no text
            if not box or type(box.word) ~= "string" then return nil end
            local word = caseInsensitive and box.word:lower() or box.word
            if not word:match(plist[pindex]) then return nil end
            matched_indices[#matched_indices + 1] = {i, j}
            j = j + 1
        end
        return matched_indices
    end

    return coroutine.wrap(function()
        -- an empty or whitespace-only pattern matches nothing
        if #plist == 0 then return end
        for i, line in ipairs(boxes) do
            for j in ipairs(line) do
                local matches = match(i, j)
                if matches then
                    coroutine.yield(matches)
                end
            end
        end
    end)
end
--- Collect the rectangles of all words taking part in a match on one page.
-- Every matched word contributes one rectangle, so a two-word phrase that
-- occurs twice produces four entries.
-- @param doc document providing getPageTextBoxes
-- @param pattern search phrase (see all_matches)
-- @param caseInsensitive truthy for a case-insensitive search
-- @param page page number to scan
-- @return list of rectangles as returned by nativeToPageRectTransform;
--         empty when nothing matches or the page has no text boxes
function KoptInterface:findAllMatches(doc, pattern, caseInsensitive, page)
    local text_boxes = doc:getPageTextBoxes(page)
    -- Always hand back a table: callers take the length of the result
    -- unconditionally, which would raise on nil.
    if not text_boxes then return {} end
    --DEBUG("boxes", text_boxes)
    local matches = {}
    for indices in all_matches(text_boxes, pattern, caseInsensitive) do
        for _, index in ipairs(indices) do
            local i, j = index[1], index[2]
            local word = text_boxes[i][j]
            local word_box = {
                x = word.x0, y = word.y0,
                w = word.x1 - word.x0,
                h = word.y1 - word.y0,
            }
            -- rects will be transformed to reflowed page rects if needed
            table.insert(matches, self:nativeToPageRectTransform(doc, page, word_box))
        end
    end
    return matches
end
--- Search page by page and return the matches of the first page that has any.
-- @param doc document to search
-- @param pattern search phrase (see findAllMatches)
-- @param origin selects the page range relative to `pageno`:
--        0 starts on the current page, 1 skips the current page,
--        -1 covers the pages on the other side of the current page
--        (forward: first page up to the page before; backward: last page
--        down to the page after)
-- @param reverse 1 to search backward, anything else to search forward
-- @param caseInsensitive truthy for a case-insensitive search
-- @param pageno current page number
-- @return list of match rectangles with the page number stored in its
--         `page` field, or nil when nothing is found
function KoptInterface:findText(doc, pattern, origin, reverse, caseInsensitive, pageno)
    DEBUG("Koptinterface: find text", pattern, origin, reverse, caseInsensitive, pageno)
    local last_pageno = doc:getPageCount()
    local backward = reverse == 1
    local start_page, end_page
    if backward then
        -- backward
        if origin == 0 then
            -- from end of current page to first page
            start_page, end_page = pageno, 1
        elseif origin == -1 then
            -- from the last page to end of current page
            start_page, end_page = last_pageno, pageno + 1
        elseif origin == 1 then
            -- from previous page to the first page
            start_page, end_page = pageno - 1, 1
        end
    else
        -- forward
        if origin == 0 then
            -- from current page to the last page
            start_page, end_page = pageno, last_pageno
        elseif origin == -1 then
            -- from the first page to current page
            start_page, end_page = 1, pageno - 1
        elseif origin == 1 then
            -- from next page to the last page
            start_page, end_page = pageno + 1, last_pageno
        end
    end
    -- an unrecognised origin selects no range; report "not found" rather
    -- than feeding nil bounds to the loop below
    if not start_page or not end_page then return end
    local step = backward and -1 or 1
    for i = start_page, end_page, step do
        local matches = self:findAllMatches(doc, pattern, caseInsensitive, i)
        -- tolerate a nil result for pages without text
        if matches and #matches > 0 then
            matches.page = i
            return matches
        end
    end
end
--[[
helper functions
--]]

@ -183,6 +183,10 @@ function PdfDocument:getCoverPageImage()
return self.koptinterface:getCoverPageImage(self)
end
--- Full-text search entry point for PDF documents.
-- Hands every argument, together with this document, to the KOPT
-- interface's findText and returns its result unchanged.
function PdfDocument:findText(pattern, origin, reverse, caseInsensitive, page)
    local kopt = self.koptinterface
    return kopt:findText(self, pattern, origin, reverse, caseInsensitive, page)
end
--- Render a page of a PDF document.
-- Hands every argument, together with this document, to the KOPT
-- interface's renderPage and returns its result unchanged.
function PdfDocument:renderPage(pageno, rect, zoom, rotation, gamma, render_mode)
    local kopt = self.koptinterface
    return kopt:renderPage(self, pageno, rect, zoom, rotation, gamma, render_mode)
end

@ -4,6 +4,7 @@ local ReaderUI = require("apps/reader/readerui")
local DEBUG = require("dbg")
local sample_epub = "spec/front/unit/data/juliet.epub"
local sample_pdf = "spec/front/unit/data/sample.pdf"
describe("Readersearch module", function()
describe("search API for EPUB documents", function()
@ -90,5 +91,104 @@ describe("Readersearch module", function()
end)
end)
-- Specs for full-text search in PDF documents, run against the sample PDF.
describe("search API for PDF documents", function()
    local doc, search, paging

    -- Number of entries findAllMatches reports for a pattern on one page.
    local function match_count(pattern, case_insensitive, pageno)
        local found = doc.koptinterface:findAllMatches(doc, pattern, case_insensitive, pageno)
        return #found
    end

    setup(function()
        local readerui = ReaderUI:new{
            document = DocumentRegistry:openDocument(sample_pdf),
        }
        doc = readerui.document
        search = readerui.search
        paging = readerui.paging
    end)

    it("should match single word with case insensitive option in one page", function()
        assert.are.equal(9, match_count("what", true, 20))
        assert.are.equal(51, match_count("the", true, 20))
        assert.are.equal(0, match_count("xxxx", true, 20))
    end)

    it("should match single word with case sensitive option in one page", function()
        assert.are.equal(7, match_count("what", false, 20))
        assert.are.equal(49, match_count("the", false, 20))
        assert.are.equal(0, match_count("xxxx", false, 20))
    end)

    it("should match phrase in one page", function()
        assert.are.equal(2*2, match_count("mean that", true, 20))
    end)

    it("should match whole phrase in one page", function()
        assert.are.equal(1*3, match_count("mean that the", true, 20))
    end)

    it("should match with lua pattern", function()
        assert.are.equal(7*1, match_count("chapter", true, 30))
        assert.are.equal(3*2, match_count("chapter %d", true, 30))
        assert.are.equal(2*2, match_count("chapter %d%d", true, 30))
        assert.are.equal(0*2, match_count("chapter %d%d%d", true, 30))
    end)

    it("should not match empty string", function()
        assert.are.equal(0, match_count("", true, 1))
    end)

    it("should not match on page without text layer", function()
        assert.are.equal(0, match_count("e", true, 1))
    end)

    it("should search backward", function()
        paging:gotoPage(20)
        assert.truthy(search:searchFromCurrent("test", 1))
        for start = 1, 40, 10 do
            paging:gotoPage(start)
            local hit = search:searchFromCurrent("test", 1)
            if hit then
                DEBUG("search backward: found at page", hit.page)
                -- a backward hit can never lie after the starting page
                assert.truthy(hit.page <= start)
            end
        end
    end)

    it("should search forward", function()
        paging:gotoPage(20)
        assert.truthy(search:searchFromCurrent("test", 0))
        for start = 1, 40, 10 do
            paging:gotoPage(start)
            local hit = search:searchFromCurrent("test", 0)
            if hit then
                DEBUG("search forward: found at page", hit.page)
                -- a forward hit can never lie before the starting page
                assert.truthy(hit.page >= start)
            end
        end
    end)

    it("should find the first occurrence", function()
        -- starting after the first hit: it is always reported on page 10
        for start = 20, 40, 10 do
            paging:gotoPage(start)
            local hit = search:searchFromStart("test")
            assert.truthy(hit)
            assert.are.equal(10, hit.page)
        end
        -- starting before the first hit: nothing is reported
        for start = 1, 10, 2 do
            paging:gotoPage(start)
            local hit = search:searchFromStart("test")
            assert(hit == nil)
        end
    end)

    it("should find the last occurrence", function()
        -- starting before the last hit: it is always reported on page 32
        for start = 10, 30, 10 do
            paging:gotoPage(start)
            local hit = search:searchFromEnd("test")
            assert.truthy(hit)
            assert.are.equal(32, hit.page)
        end
        -- starting after the last hit: nothing is reported
        for start = 40, 50, 2 do
            paging:gotoPage(start)
            local hit = search:searchFromEnd("test")
            assert(hit == nil)
        end
    end)

    it("should find all occurrences", function()
        local total = 0
        paging:gotoPage(1)
        local hit = search:searchFromCurrent("test", 0)
        while hit do
            total = total + #hit
            --DEBUG("found words", #hit, hit.page)
            -- continue from the page of the latest hit
            paging:gotoPage(hit.page)
            hit = search:searchNext("test", 0)
        end
        assert.are.equal(11, total)
    end)
end)
end)

Loading…
Cancel
Save