split ancient Greek words with spacing character

This should fix #1705.
pull/2114/head
chrox 8 years ago
parent ffd01b3345
commit 71bf9efc7c

@ -19,6 +19,7 @@ local RenderText = require("ui/rendertext")
local Screen = require("device").screen
local Geom = require("ui/geometry")
local util = require("util")
local DEBUG= require("dbg")
local TextBoxWidget = Widget:new{
text = nil,
@ -282,7 +283,7 @@ function TextBoxWidget:onHoldWord(callback, ges)
local x, y = ges.pos.x - self.dimen.x, ges.pos.y - self.dimen.y
local line_num = math.ceil(y / self.line_height_px)
local line = self.vertical_string_list[line_num]
DEBUG("holding on line", line)
if line then
local char_start = line.offset
local char_end -- char_end is non-inclusive
@ -304,10 +305,10 @@ function TextBoxWidget:onHoldWord(callback, ges)
-- now find which word the character is in
local words = util.splitToWords(line.text)
local probe_idx = char_start
for _,w in ipairs(words) do
for _, w in ipairs(words) do
-- +1 for word separator
probe_idx = probe_idx + string.len(w)
if idx <= probe_idx then
probe_idx = probe_idx + #util.splitToChars(w)
if idx <= probe_idx - 1 then
callback(w)
return
end

@ -121,10 +121,15 @@ end
---- @string text text to split
---- @treturn table list of words, spaces and punctuations
function util.splitToWords(text)
-- TODO: write test
local wlist = {}
for words in text:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
for word in util.gsplit(words, "[%s%p]+", true) do
for word in util.gsplit(text, "[%s%p]+", true) do
-- if space splitted word contains CJK characters
if word:match("[\228-\234][\128-\191]+") then
-- split with CJK characters
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
table.insert(wlist, char)
end
else
table.insert(wlist, word)
end
end

@ -52,4 +52,28 @@ describe("util module", function()
"five",
})
end)
-- Polytonic Greek input: the words consist of multi-byte UTF-8 characters and
-- must come back whole, with each space and the final period returned as
-- separate entries.
it("should split ancient greek words", function()
local words = util.splitToWords("Λαρισαῖος Λευκοθέα Λιγυαστάδης.")
assert.are_same(words, {
"Λαρισαῖος",
" ",
"Λευκοθέα",
" ",
"Λιγυαστάδης",
"."
})
end)
-- CJK text has no spaces between words, so every character of the input
-- (including the trailing full-width period) is expected as its own entry.
it("should split Chinese words", function()
    local words = util.splitToWords("彩虹是通过太阳光的折射引起的。")
    assert.are_same(words, {
        "彩", "虹", "是", "通", "过", "太", "阳", "光",
        "的", "折", "射", "引", "起", "的", "。",
    })
end)
-- Mixed Latin/CJK input: the Latin run stays together as one word while each
-- CJK character that follows is expected as a separate entry.
it("should split words of multilingual text", function()
    local words = util.splitToWords("BBC纪录片")
    assert.are_same(words, {"BBC", "纪", "录", "片"})
end)
end)

Loading…
Cancel
Save