From 71bf9efc7c44caf3ba4ba84e648686cc0638b6cf Mon Sep 17 00:00:00 2001
From: chrox <chrox.huang@gmail.com>
Date: Tue, 28 Jun 2016 23:50:21 +0800
Subject: [PATCH] split ancient Greek words with spacing character. This should
 fix #1705.

---
 frontend/ui/widget/textboxwidget.lua |  9 +++++----
 frontend/util.lua                    | 11 ++++++++---
 spec/unit/util_spec.lua              | 24 ++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/frontend/ui/widget/textboxwidget.lua b/frontend/ui/widget/textboxwidget.lua
index 066117ce2..9a8170dd0 100644
--- a/frontend/ui/widget/textboxwidget.lua
+++ b/frontend/ui/widget/textboxwidget.lua
@@ -19,6 +19,7 @@ local RenderText = require("ui/rendertext")
 local Screen = require("device").screen
 local Geom = require("ui/geometry")
 local util = require("util")
+local DEBUG= require("dbg")
 
 local TextBoxWidget = Widget:new{
     text = nil,
@@ -282,7 +283,7 @@ function TextBoxWidget:onHoldWord(callback, ges)
     local x, y = ges.pos.x - self.dimen.x, ges.pos.y - self.dimen.y
     local line_num = math.ceil(y / self.line_height_px)
     local line = self.vertical_string_list[line_num]
-
+    DEBUG("holding on line", line)
     if line then
         local char_start = line.offset
         local char_end  -- char_end is non-inclusive
@@ -304,10 +305,10 @@ function TextBoxWidget:onHoldWord(callback, ges)
                 -- now find which word the character is in
                 local words = util.splitToWords(line.text)
                 local probe_idx = char_start
-                for _,w in ipairs(words) do
+                for _, w in ipairs(words) do
                     -- +1 for word separtor
-                    probe_idx = probe_idx + string.len(w)
-                    if idx <= probe_idx then
+                    probe_idx = probe_idx + #util.splitToChars(w)
+                    if idx <= probe_idx - 1 then
                         callback(w)
                         return
                     end
diff --git a/frontend/util.lua b/frontend/util.lua
index ac50a3824..b8c73ba2f 100644
--- a/frontend/util.lua
+++ b/frontend/util.lua
@@ -121,10 +121,15 @@ end
 ---- @string text text to split
 ---- @treturn table list of words, spaces and punctuations
 function util.splitToWords(text)
-    -- TODO: write test
     local wlist = {}
-    for words in text:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
-        for word in util.gsplit(words, "[%s%p]+", true) do
+    for word in util.gsplit(text, "[%s%p]+", true) do
+        -- if space splitted word contains CJK characters
+        if word:match("[\228-\234][\128-\191]+") then
+            -- split with CJK characters
+            for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
+                table.insert(wlist, char)
+            end
+        else
             table.insert(wlist, word)
         end
     end
diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua
index ca1a2b7e4..de1918a5a 100644
--- a/spec/unit/util_spec.lua
+++ b/spec/unit/util_spec.lua
@@ -52,4 +52,28 @@ describe("util module", function()
             "five",
         })
     end)
+
+    it("should split ancient greek words", function()
+        local words = util.splitToWords("Λαρισαῖος Λευκοθέα Λιγυαστάδης.")
+        assert.are_same(words, {
+            "Λαρισαῖος",
+            " ",
+            "Λευκοθέα",
+            " ",
+            "Λιγυαστάδης",
+            "."
+        })
+    end)
+
+    it("should split Chinese words", function()
+        local words = util.splitToWords("彩虹是通过太阳光的折射引起的。")
+        assert.are_same(words, {
+            "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
+        })
+    end)
+
+    it("should split words of multilingual text", function()
+        local words = util.splitToWords("BBC纪录片")
+        assert.are_same(words, {"BBC", "纪", "录", "片"})
+    end)
 end)