add document language option in config dialog

The document language setting is used to improve text extraction and OCR.
pull/163/head
chrox 11 years ago
parent 167e6ca0aa
commit 1808c7e5c3

@ -41,6 +41,10 @@ DKOPTREADER_CONFIG_JUSTIFICATION = -1 -- -1 = auto, 0 = left, 1 = center, 2 = ri
DKOPTREADER_CONFIG_MAX_COLUMNS = 2 -- range from 1 to 4
DKOPTREADER_CONFIG_CONTRAST = 1.0 -- range from 0.2 to 2.0
-- document languages for OCR
-- DOC_LANGS_TEXT supplies the toggle labels and DOC_LANGS_CODE the stored
-- values of the "doc_language" option; DOC_DEFAULT_LANG_CODE is its default.
-- NOTE(review): the two lists are presumably paired by position (label i <->
-- code i), so keep them the same length and order -- confirm against the
-- config dialog code.
DKOPTREADER_CONFIG_DOC_LANGS_TEXT = {"English", "Chinese_S", "Chinese_T"}
DKOPTREADER_CONFIG_DOC_LANGS_CODE = {"eng", "chi_sim", "chi_tra"}
DKOPTREADER_CONFIG_DOC_DEFAULT_LANG_CODE = "eng"
-- ####################################################################
-- following features are not supported right now

@ -62,6 +62,7 @@ function KoptInterface:createContext(doc, pageno, bbox)
kc:setDefectSize(doc.configurable.defect_size)
kc:setLineSpacing(doc.configurable.line_spacing)
kc:setWordSpacing(doc.configurable.word_spacing)
kc:setLanguage(doc.configurable.doc_language)
kc:setBBox(bbox.x0, bbox.y0, bbox.x1, bbox.y1)
if Dbg.is_on then kc:setDebug() end
return kc
@ -129,6 +130,13 @@ function KoptInterface:getReflewTextBoxes(doc, pageno)
local cached = Cache:check(kctx_hash)
if cached then
local kc = self:waitForContext(cached.kctx)
--kc:setDebug()
local lang = doc.configurable.doc_language
if lang == "chi_sim" or lang == "chi_tra" or
lang == "jpn" or lang == "kor" then
kc:setCJKChar()
end
kc:setLanguage(lang)
local fullwidth, fullheight = kc:getPageDim()
local boxes = kc:getWordBoxes(0, 0, fullwidth, fullheight)
Cache:insert(hash, CacheItem:new{ rfpgboxes = boxes })
@ -146,6 +154,12 @@ function KoptInterface:getTextBoxes(doc, pageno)
local kc_hash = "kctx|"..doc.file.."|"..pageno
local kc = KOPTContext.new()
kc:setDebug()
local lang = doc.configurable.doc_language
if lang == "chi_sim" or lang == "chi_tra" or
lang == "jpn" or lang == "kor" then
kc:setCJKChar()
end
kc:setLanguage(lang)
local page = doc._document:openPage(pageno)
page:getPagePix(kc)
local fullwidth, fullheight = kc:getPageDim()
@ -167,6 +181,7 @@ function KoptInterface:getReflewOCRWord(doc, pageno, rect)
local dummy = KOPTContext.new()
Cache:insert(ocrengine, OCREngine:new{ ocrengine = dummy })
end
self.ocr_lang = doc.configurable.doc_language
local bbox = doc:getPageBBox(pageno)
local context_hash = self:getContextHash(doc, pageno, bbox)
local hash = "rfocrword|"..context_hash..rect.x..rect.y..rect.w..rect.h
@ -197,6 +212,7 @@ function KoptInterface:getOCRWord(doc, pageno, rect)
local dummy = KOPTContext.new()
Cache:insert(ocrengine, OCREngine:new{ ocrengine = dummy })
end
self.ocr_lang = doc.configurable.doc_language
local hash = "ocrword|"..doc.file.."|"..pageno..rect.x..rect.y..rect.w..rect.h
local cached = Cache:check(hash)
if not cached then

@ -160,6 +160,13 @@ KoptOptions = {
},
}
},
{
name="doc_language",
name_text = DOC_LANG_STR,
toggle = DKOPTREADER_CONFIG_DOC_LANGS_TEXT,
values = DKOPTREADER_CONFIG_DOC_LANGS_CODE,
default_value = DKOPTREADER_CONFIG_DOC_DEFAULT_LANG_CODE,
},
{
name="screen_rotation",
name_text = VERTICAL_TEXT_STR,
@ -195,6 +202,7 @@ KoptOptions = {
toggle = {ZERO_DEG_STR, FIVE_DEG_STR, TEN_DEG_STR},
values = {0, 5, 10},
default_value = DKOPTREADER_CONFIG_AUTO_STRAIGHTEN,
show = false,
},
{
name = "detect_indent",

@ -10,6 +10,7 @@ TEXT_ALIGN_STR = _("Text Align")
FONTSIZE_FINE_TUNING_STR = _("Fine Tuning")
CONTRAST_STR = _("Contrast")
REFLOW_STR = _("Reflow")
DOC_LANG_STR = _("Document Language")
VERTICAL_TEXT_STR = _("Vertical Text")
WORD_GAP_STR = _("Word Gap")
DEFECT_SIZE_STR = _("Defect Size")

@ -6,7 +6,8 @@ function Configurable:hash(sep)
local hash = ""
local excluded = {multi_threads = true,}
for key,value in pairs(self) do
if type(value) == "number" and not excluded[key] then
-- Only number and string values take part in the hash, and never for keys
-- listed in `excluded`. The parentheses are required: `and` binds tighter
-- than `or`, so without them the exclusion would apply to strings only and
-- excluded numeric keys would still be hashed.
if (type(value) == "number" or type(value) == "string")
    and not excluded[key] then
    hash = hash..sep..value
end
end
@ -28,8 +29,9 @@ end
function Configurable:loadSettings(settings, prefix)
for key,value in pairs(self) do
if type(value) == "number" or type(value) == "table" then
saved_value = settings:readSetting(prefix..key)
if type(value) == "number" or type(value) == "string"
or type(value) == "table" then
local saved_value = settings:readSetting(prefix..key)
self[key] = (saved_value == nil) and self[key] or saved_value
--Debug("Configurable:loadSettings", "key", key, "saved value", saved_value,"Configurable.key", self[key])
end
@ -39,7 +41,8 @@ end
function Configurable:saveSettings(settings, prefix)
for key,value in pairs(self) do
if type(value) == "number" or type(value) == "table" then
if type(value) == "number" or type(value) == "string"
or type(value) == "table" then
settings:saveSetting(prefix..key, value)
end
end

@ -204,6 +204,16 @@ function ConfigOption:init()
end
-- make current index according to configurable table
local current_item = nil
--- Distance between two option values of the same type.
-- Numbers yield their absolute difference. Every other type (strings,
-- booleans, ...) yields 0 when the two values are equal and 1 otherwise, so
-- the result is always a number that can be compared against other distances.
-- @param val1 first value
-- @param val2 second value, must have the same Lua type as val1
-- @param name optional option name, only used to make the error message useful
-- @return non-negative number
-- Raises an error when the two values have different Lua types.
local function value_diff(val1, val2, name)
    if type(val1) ~= type(val2) then
        -- error()'s second argument is a stack level, not extra message
        -- text, so the option name has to be concatenated into the message.
        local msg = "different data types in option"
        if name ~= nil then
            msg = msg .. " " .. tostring(name)
        end
        error(msg)
    end
    if type(val1) == "number" then
        return math.abs(val1 - val2)
    end
    -- Non-numeric values have no meaningful magnitude: equal -> 0, else 1.
    return val1 == val2 and 0 or 1
end
if self.options[c].name then
if self.options[c].values then
-- check if current value is stored in configurable or calculated in runtime
@ -211,18 +221,18 @@ function ConfigOption:init()
or self.config.configurable[self.options[c].name]
local min_diff = nil
if type(val) == "table" then
min_diff = math.abs(val[1] - self.options[c].values[1][1])
min_diff = value_diff(val[1], self.options[c].values[1][1])
else
min_diff = math.abs(val - self.options[c].values[1])
min_diff = value_diff(val, self.options[c].values[1])
end
local diff = nil
for index, val_ in pairs(self.options[c].values) do
local diff = nil
if type(val) == "table" then
diff = math.abs(val[1] - val_[1])
diff = value_diff(val[1], val_[1])
else
diff = math.abs(val - val_)
diff = value_diff(val, val_)
end
if val == val_ then
current_item = index

@ -1 +1 @@
Subproject commit 7de326340ee639c79957f5c4e80c47116c0849eb
Subproject commit d45c0ac4b1f4801e193d0bc9a6c3759989dec7c2
Loading…
Cancel
Save