From 71bf9efc7c44caf3ba4ba84e648686cc0638b6cf Mon Sep 17 00:00:00 2001 From: chrox Date: Tue, 28 Jun 2016 23:50:21 +0800 Subject: [PATCH] split ancient greek words with spacing character This should fix #1705. --- frontend/ui/widget/textboxwidget.lua | 9 +++++---- frontend/util.lua | 11 ++++++++--- spec/unit/util_spec.lua | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/frontend/ui/widget/textboxwidget.lua b/frontend/ui/widget/textboxwidget.lua index 066117ce2..9a8170dd0 100644 --- a/frontend/ui/widget/textboxwidget.lua +++ b/frontend/ui/widget/textboxwidget.lua @@ -19,6 +19,7 @@ local RenderText = require("ui/rendertext") local Screen = require("device").screen local Geom = require("ui/geometry") local util = require("util") +local DEBUG= require("dbg") local TextBoxWidget = Widget:new{ text = nil, @@ -282,7 +283,7 @@ function TextBoxWidget:onHoldWord(callback, ges) local x, y = ges.pos.x - self.dimen.x, ges.pos.y - self.dimen.y local line_num = math.ceil(y / self.line_height_px) local line = self.vertical_string_list[line_num] - + DEBUG("holding on line", line) if line then local char_start = line.offset local char_end -- char_end is non-inclusive @@ -304,10 +305,10 @@ function TextBoxWidget:onHoldWord(callback, ges) -- now find which word the character is in local words = util.splitToWords(line.text) local probe_idx = char_start - for _,w in ipairs(words) do + for _, w in ipairs(words) do -- +1 for word separtor - probe_idx = probe_idx + string.len(w) - if idx <= probe_idx then + probe_idx = probe_idx + #util.splitToChars(w) + if idx <= probe_idx - 1 then callback(w) return end diff --git a/frontend/util.lua b/frontend/util.lua index ac50a3824..b8c73ba2f 100644 --- a/frontend/util.lua +++ b/frontend/util.lua @@ -121,10 +121,15 @@ end ---- @string text text to split ---- @treturn table list of words, spaces and punctuations function util.splitToWords(text) - -- TODO: write test local wlist = {} - for words in
text:gmatch("[\32-\127\192-\255]+[\128-\191]*") do - for word in util.gsplit(words, "[%s%p]+", true) do + for word in util.gsplit(text, "[%s%p]+", true) do + -- if space splitted word contains CJK characters + if word:match("[\228-\234][\128-\191]+") then + -- split with CJK characters + for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do + table.insert(wlist, char) + end + else table.insert(wlist, word) end end diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua index ca1a2b7e4..de1918a5a 100644 --- a/spec/unit/util_spec.lua +++ b/spec/unit/util_spec.lua @@ -52,4 +52,28 @@ describe("util module", function() "five", }) end) + + it("should split ancient greek words", function() + local words = util.splitToWords("Λαρισαῖος Λευκοθέα Λιγυαστάδης.") + assert.are_same(words, { + "Λαρισαῖος", + " ", + "Λευκοθέα", + " ", + "Λιγυαστάδης", + "." + }) + end) + + it("should split Chinese words", function() + local words = util.splitToWords("彩虹是通过太阳光的折射引起的。") + assert.are_same(words, { + "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。", + }) + end) + + it("should split words of multilingual text", function() + local words = util.splitToWords("BBC纪录片") + assert.are_same(words, {"BBC", "纪", "录", "片"}) + end) end)