From 7c78e83e4969925d9e3558ff9fa207f00e64b070 Mon Sep 17 00:00:00 2001 From: chrox Date: Wed, 1 May 2013 23:43:53 +0800 Subject: [PATCH] add hightlight/dictionary lookup in scanned pdf/djvu --- frontend/document/djvudocument.lua | 18 ++++--- frontend/document/document.lua | 2 +- frontend/document/koptinterface.lua | 69 ++++++++++++++++++++++---- frontend/document/pdfdocument.lua | 17 ++++--- frontend/ui/reader/readerhighlight.lua | 53 ++++++++++---------- 5 files changed, 107 insertions(+), 52 deletions(-) diff --git a/frontend/document/djvudocument.lua b/frontend/document/djvudocument.lua index 2c386a3be..c6b723e51 100644 --- a/frontend/document/djvudocument.lua +++ b/frontend/document/djvudocument.lua @@ -47,22 +47,24 @@ function validDjvuFile(filename) return true end -function DjvuDocument:getPageText(pageno) +function DjvuDocument:getTextBoxes(pageno) if self.configurable.text_wrap == 1 then - return self.koptinterface:getPageText(self, pageno) + return self.koptinterface:getReflewTextBoxes(self, pageno) else - return self._document:getPageText(pageno) + local text = self._document:getPageText(pageno) + if not text or #text == 0 then + return self.koptinterface:getTextBoxes(self, pageno) + else + return text + end end end function DjvuDocument:getOCRWord(pageno, rect) if self.configurable.text_wrap == 1 then - return self.koptinterface:getOCRWord(self, pageno, rect) + return self.koptinterface:getReflewOCRWord(self, pageno, rect) else - --local page = self._document:openPage(pageno) - --local word = page:getOCRWord(rect) - --page:close() - --return word + return self.koptinterface:getOCRWord(self, pageno, rect) end end diff --git a/frontend/document/document.lua b/frontend/document/document.lua index 5e53ecb8f..d12a057da 100644 --- a/frontend/document/document.lua +++ b/frontend/document/document.lua @@ -184,7 +184,7 @@ function Document:getToc() return self._document:getToc() end -function Document:getPageText(pageno) +function Document:getTextBoxes(pageno) return nil end diff --git a/frontend/document/koptinterface.lua b/frontend/document/koptinterface.lua index 6628bc476..c9df1e10e 100644 --- a/frontend/document/koptinterface.lua +++ b/frontend/document/koptinterface.lua @@ -95,10 +95,10 @@ function KoptInterface:getAutoBBox(doc, pageno) end end -function KoptInterface:getPageText(doc, pageno) +function KoptInterface:getReflewTextBoxes(doc, pageno) local bbox = doc:getPageBBox(pageno) local context_hash = self:getContextHash(doc, pageno, bbox) - local hash = "pgtext|"..context_hash + local hash = "rfpgboxes|"..context_hash local cached = Cache:check(hash) if not cached then local kctx_hash = "kctx|"..context_hash @@ -106,16 +106,38 @@ function KoptInterface:getPageText(doc, pageno) if cached then local kc = self:waitForContext(cached.kctx) local fullwidth, fullheight = kc:getPageDim() - local text = kc:getWordBoxes(0, 0, fullwidth, fullheight) - Cache:insert(hash, CacheItem:new{ pgtext = text }) - return text + local boxes = kc:getWordBoxes(0, 0, fullwidth, fullheight) + Cache:insert(hash, CacheItem:new{ rfpgboxes = boxes }) + return boxes end else - return cached.pgtext + return cached.rfpgboxes end end -function KoptInterface:getOCRWord(doc, pageno, rect) +function KoptInterface:getTextBoxes(doc, pageno) + local hash = "pgboxes|"..doc.file.."|"..pageno + local cached = Cache:check(hash) + if not cached then + local kc_hash = "kctx|"..doc.file.."|"..pageno + local kc = KOPTContext.new() + kc:setDebug() + local page = doc._document:openPage(pageno) + page:getPagePix(kc) + local fullwidth, fullheight = kc:getPageDim() + local boxes = kc:getWordBoxes(0, 0, fullwidth, fullheight) + Cache:insert(hash, CacheItem:new{ pgboxes = boxes }) + Cache:insert(kc_hash, ContextCacheItem:new{ kctx = kc }) + return boxes + else + return cached.pgboxes + end +end + +--[[ +get word from OCR in reflew page +--]] +function KoptInterface:getReflewOCRWord(doc, pageno, rect) local ocrengine = "ocrengine" if not Cache:check(ocrengine) then local dummy = KOPTContext.new() @@ -123,14 +145,43 @@ function KoptInterface:getOCRWord(doc, pageno, rect) end local bbox = doc:getPageBBox(pageno) local context_hash = self:getContextHash(doc, pageno, bbox) - local hash = "ocrword|"..context_hash..rect.x..rect.y..rect.w..rect.h + local hash = "rfocrword|"..context_hash..rect.x..rect.y..rect.w..rect.h local cached = Cache:check(hash) if not cached then local kctx_hash = "kctx|"..context_hash local cached = Cache:check(kctx_hash) if cached then local kc = self:waitForContext(cached.kctx) - local fullwidth, fullheight = kc:getPageDim() + local ok, word = pcall( + kc.getTOCRWord, kc, + rect.x, rect.y, rect.w, rect.h, + self.tessocr_data, self.ocr_lang, self.ocr_type, 0, 1) + Cache:insert(hash, CacheItem:new{ rfocrword = word }) + return word + end + else + return cached.rfocrword + end +end + +--[[ +get word from OCR in non-reflew page +--]] +function KoptInterface:getOCRWord(doc, pageno, rect) + local ocrengine = "ocrengine" + if not Cache:check(ocrengine) then + local dummy = KOPTContext.new() + Cache:insert(ocrengine, OCREngine:new{ ocrengine = dummy }) + end + local hash = "ocrword|"..doc.file.."|"..pageno..rect.x..rect.y..rect.w..rect.h + local cached = Cache:check(hash) + if not cached then + local pgboxes_hash = "pgboxes|"..doc.file.."|"..pageno + local pgboxes_cached = Cache:check(pgboxes_hash) + local kc_hash = "kctx|"..doc.file.."|"..pageno + local kc_cashed = Cache:check(kc_hash) + if pgboxes_cached and kc_cashed then + local kc = kc_cashed.kctx local ok, word = pcall( kc.getTOCRWord, kc, rect.x, rect.y, rect.w, rect.h, diff --git a/frontend/document/pdfdocument.lua b/frontend/document/pdfdocument.lua index 78f633b1a..bda592979 100644 --- a/frontend/document/pdfdocument.lua +++ b/frontend/document/pdfdocument.lua @@ -44,25 +44,26 @@ function PdfDocument:unlock(password) return self:_readMetadata() end -function PdfDocument:getPageText(pageno) +function PdfDocument:getTextBoxes(pageno) if self.configurable.text_wrap == 1 then - return self.koptinterface:getPageText(self, pageno) + return self.koptinterface:getReflewTextBoxes(self, pageno) else local page = self._document:openPage(pageno) local text = page:getPageText() page:close() - return text + if not text or #text == 0 then + return self.koptinterface:getTextBoxes(self, pageno) + else + return text + end end end function PdfDocument:getOCRWord(pageno, rect) if self.configurable.text_wrap == 1 then - return self.koptinterface:getOCRWord(self, pageno, rect) + return self.koptinterface:getReflewOCRWord(self, pageno, rect) else - --local page = self._document:openPage(pageno) - --local word = page:getOCRWord(rect) - --page:close() - --return word + return self.koptinterface:getOCRWord(self, pageno, rect) end end diff --git a/frontend/ui/reader/readerhighlight.lua b/frontend/ui/reader/readerhighlight.lua index b108b68a6..33e428dda 100644 --- a/frontend/ui/reader/readerhighlight.lua +++ b/frontend/ui/reader/readerhighlight.lua @@ -54,17 +54,34 @@ end function ReaderHighlight:onHold(arg, ges) self.pos = self.view:screenToPageTransform(ges.pos) DEBUG("hold position in page", self.pos) - local text = self.ui.document:getPageText(self.pos.page) - --DEBUG("page text", text) + local text_boxes = self.ui.document:getTextBoxes(self.pos.page) + --DEBUG("page text", text_boxes) - if not text or #text == 0 then - DEBUG("no text extracted") + if not text_boxes or #text_boxes == 0 then + DEBUG("no text box detected") return true end - self.word_info = self:getWordFromText(text, self.pos) + self.word_info = self:getWordFromBoxes(text_boxes, self.pos) DEBUG("hold word info in page", self.word_info) if self.word_info then + -- if we extracted text directly + if self.word_info.word then + self.ui:handleEvent(Event:new("LookupWord", self.word_info.word)) + -- or we will do OCR + else + UIManager:scheduleIn(0.1, function() + local word_box = self.word_info.box + word_box.x = word_box.x - math.floor(word_box.h * 0.2) + word_box.y = word_box.y - math.floor(word_box.h * 0.4) + word_box.w = word_box.w + math.floor(word_box.h * 0.4) + word_box.h = word_box.h + math.floor(word_box.h * 0.6) + local word = self.ui.document:getOCRWord(self.pos.page, word_box) + DEBUG("OCRed word:", word) + self.ui:handleEvent(Event:new("LookupWord", word)) + end) + end + local screen_rect = self.view:pageToScreenTransform(self.pos.page, self.word_info.box) DEBUG("highlight word rect", screen_rect) if screen_rect then @@ -74,28 +91,12 @@ function ReaderHighlight:onHold(arg, ges) screen_rect.h = screen_rect.h + screen_rect.h * 0.4 self.view.highlight.rect = screen_rect UIManager:setDirty(self.dialog, "partial") - -- if we extracted text directly - if self.word_info.word then - self.ui:handleEvent(Event:new("LookupWord", self.word_info.word)) - -- or we will do OCR - else - UIManager:scheduleIn(0.1, function() - local word_box = self.word_info.box - word_box.x = word_box.x - math.floor(word_box.h * 0.2) - word_box.y = word_box.y - math.floor(word_box.h * 0.4) - word_box.w = word_box.w + math.floor(word_box.h * 0.4) - word_box.h = word_box.h + math.floor(word_box.h * 0.6) - local word = self.ui.document:getOCRWord(self.pos.page, word_box) - DEBUG("OCRed word:", word) - self.ui:handleEvent(Event:new("LookupWord", word)) - end) - end end end return true end -function ReaderHighlight:getWordFromText(text, pos) +function ReaderHighlight:getWordFromBoxes(boxes, pos) local function ges_inside(x0, y0, x1, y1) local x, y = pos.x, pos.y if x0 ~= nil and y0 ~= nil and x1 ~= nil and y1 ~= nil then @@ -106,12 +107,12 @@ function ReaderHighlight:getWordFromText(text, pos) return false end - for i = 1, #text do - local l = text[i] + for i = 1, #boxes do + local l = boxes[i] if ges_inside(l.x0, l.y0, l.x1, l.y1) then --DEBUG("line box", l.x0, l.y0, l.x1, l.y1) - for j = 1, #text[i] do - local w = text[i][j] + for j = 1, #boxes[i] do + local w = boxes[i][j] if ges_inside(w.x0, w.y0, w.x1, w.y1) then local box = Geom:new{ x = w.x0, y = w.y0,