From a4720b44cda1754cac3cb57b21fc045131a46401 Mon Sep 17 00:00:00 2001 From: poire-z Date: Mon, 3 Jul 2023 19:49:22 +0200 Subject: [PATCH] Text search: various Kopt search fixes - Properly parse input text for words (the previous code wasn't working with Greek letters) - With multiple words search, don't allow "substring matching" for words in the middle - Remove support for Lua pattens, so to get proper substring matching (as we have with cre text search) --- frontend/document/koptinterface.lua | 42 ++++++++++++++++++++++------- spec/unit/readersearch_spec.lua | 6 ----- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/frontend/document/koptinterface.lua b/frontend/document/koptinterface.lua index b8be38da5..e5fef4ac8 100644 --- a/frontend/document/koptinterface.lua +++ b/frontend/document/koptinterface.lua @@ -13,6 +13,7 @@ local Geom = require("ui/geometry") local KOPTContext = require("ffi/koptcontext") local Persist = require("persist") local TileCacheItem = require("document/tilecacheitem") +local Utf8Proc = require("ffi/utf8proc") local logger = require("logger") local util = require("util") @@ -1337,18 +1338,22 @@ end local function all_matches(boxes, pattern, caseInsensitive) -- pattern list of single words local plist = {} - -- split utf-8 characters - for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do - -- split space seperated words - for word in words:gmatch("[^%s]+") do - table.insert(plist, caseInsensitive and word:lower() or word) + -- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations) + for word in util.gsplit(pattern, "%s+") do + if util.hasCJKChar(word) then + for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do + table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char) + end + else + table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word) end end + local pnb = #plist -- return mached word indices from index i, j local function match(i, j) local pindex = 1 local matched_indices = {} - if #plist == 0 then return end + if pnb == 0 then return end while true do if #boxes[i] < j then j = j - #boxes[i] @@ -1356,10 +1361,27 @@ local function all_matches(boxes, pattern, caseInsensitive) end if i > #boxes then break end local box = boxes[i][j] - local word = caseInsensitive and box.word:lower() or box.word - if word:match(plist[pindex]) then + local word = caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word + local pword = plist[pindex] + local matched + if pnb == 1 then -- single word in plist + matched = word:find(pword, 1, true) + else -- multiple words in plist + if pindex == 1 then + -- first word of query should match at end of a word from the document + matched = word:sub(-#pword) == pword + elseif pindex == pnb then + -- last word of query should match at start of the word from the document + matched = word:sub(1, #pword) == pword + else + -- middle words in query should match exactly the word from the document + matched = word == pword + end + end + if matched then table.insert(matched_indices, {i, j}) - if pindex == #plist then + if pindex == pnb then + -- all words in plist iterated, all matched return matched_indices else j = j + 1 @@ -1370,6 +1392,8 @@ local function all_matches(boxes, pattern, caseInsensitive) end end end + -- Note that this returns a full word box, even if what matches + -- is only a substring of a word box. return coroutine.wrap(function() for i, line in ipairs(boxes) do for j, box in ipairs(line) do diff --git a/spec/unit/readersearch_spec.lua b/spec/unit/readersearch_spec.lua index dedbfbb9a..bf267e68f 100644 --- a/spec/unit/readersearch_spec.lua +++ b/spec/unit/readersearch_spec.lua @@ -151,12 +151,6 @@ describe("Readersearch module", function() it("should match whole phrase in one page", function() assert.are.equal(1*3, #doc.koptinterface:findAllMatches(doc, "mean that the", true, 20)) end) - it("should match with lua pattern", function() - assert.are.equal(7*1, #doc.koptinterface:findAllMatches(doc, "chapter", true, 30)) - assert.are.equal(3*2, #doc.koptinterface:findAllMatches(doc, "chapter %d", true, 30)) - assert.are.equal(2*2, #doc.koptinterface:findAllMatches(doc, "chapter %d%d", true, 30)) - assert.are.equal(0*2, #doc.koptinterface:findAllMatches(doc, "chapter %d%d%d", true, 30)) - end) it("should not match empty string", function() assert.are.equal(0, #doc.koptinterface:findAllMatches(doc, "", true, 1)) end)