From 6f1b70e5eb71fed5e1844ba4e39e69411b710d43 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 23 Oct 2021 21:12:38 +1100 Subject: [PATCH] util.utf8: improve CJK character detection Previously the CJK character detection defined only characters in the range U+4000..U+AFFF as "CJK characters". This excludes an incredibly large number of CJK characters within the BMP, let alone the whole two planes dedicated to rarer CJK characters (the SIP and TIP). As a result, a very large number of Chinese, Japanese, and Korean characters were not detected as being CJK characters. While slightly less elegant-looking, it is far more accurate to compute the codepoint from the utf8 character and then see if it falls within one of the defined CJK blocks. This is not future-proof against future CJK ideograph extensions in future Unicode versions, but there is no real way to accurately predict such changes so this is the best we can do without accidentally treating characters explicitly defined as being non-CJK in Unicode as CJK. While we're at it, copy Lua 5.3's utf8.charpattern constant definition so that we can more easily write utf8 iterators with string.gmatch (at least in the interim until there is a rework of utf8 handling in KOReader and everything is rebuilt on top of utf8proc). Some unit tests are added for Korean and Japanese text, and the existing unit tests needed a minor adjustment to handle the fact that isSplittable now correctly detects CJK punctuation as a character to compare against the forbidden split rules. 
Signed-off-by: Aleksa Sarai --- frontend/util.lua | 52 ++++++++++++++++++++--- spec/unit/util_spec.lua | 91 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 132 insertions(+), 11 deletions(-) diff --git a/frontend/util.lua b/frontend/util.lua index 80daa2929..c298a4f70 100644 --- a/frontend/util.lua +++ b/frontend/util.lua @@ -521,11 +521,16 @@ function util.lastIndexOf(string, ch) if i == nil then return -1 else return i - 1 end end +--- Pattern which matches a single well-formed UTF-8 character, including +--- theoretical >4-byte extensions. +-- Taken from +util.UTF8_CHAR_PATTERN = '[%z\1-\127\194-\253][\128-\191]*' + --- Reverse the individual greater-than-single-byte characters -- @string string to reverse -- Taken from function util.utf8Reverse(text) - text = text:gsub('[%z\1-\127\194-\244][\128-\191]*', function (c) return #c > 1 and c:reverse() end) + text = text:gsub(util.UTF8_CHAR_PATTERN, function (c) return #c > 1 and c:reverse() end) return text:reverse() end @@ -554,7 +559,7 @@ function util.splitToChars(text) -- characters directly, but only as a pair. local hi_surrogate local hi_surrogate_uchar - for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do + for uchar in text:gmatch(util.UTF8_CHAR_PATTERN) do charcode = BaseUtil.utf8charcode(uchar) -- (not sure why we need this prevcharcode check; we could get -- charcode=nil with invalid UTF-8, but should we then really @@ -589,14 +594,47 @@ end ---- @string c ---- @treturn boolean true if CJK function util.isCJKChar(c) - return string.match(c, "[\228-\234][\128-\191].") == c + -- Smallest CJK codepoint is 0x1100 which requires at least 3 utf8 bytes to + -- encode (U+07FF is the largest codepoint that can be represented in 2 + -- bytes with utf8). So if the character is shorter than 3 bytes it's + -- definitely not CJK and no need to decode it. 
+ if #c < 3 then + return false + end + local code = BaseUtil.utf8charcode(c) + -- The weird bracketing is intentional -- we use the lowest possible + codepoint as a shortcut so if the codepoint is below U+1100 we + immediately return false. + return -- BMP (Plane 0) + code >= 0x1100 and (code <= 0x11FF or -- Hangul Jamo + (code >= 0x2E80 and code <= 0x9FFF) or -- Numerous CJK Blocks (NB: has some gaps) + (code >= 0xA960 and code <= 0xA97F) or -- Hangul Jamo Extended-A + (code >= 0xAC00 and code <= 0xD7AF) or -- Hangul Syllables + (code >= 0xD7B0 and code <= 0xD7FF) or -- Hangul Jamo Extended-B + (code >= 0xF900 and code <= 0xFAFF) or -- CJK Compatibility Ideographs + (code >= 0xFE30 and code <= 0xFE4F) or -- CJK Compatibility Forms + (code >= 0xFF00 and code <= 0xFFEF) or -- Halfwidth and Fullwidth Forms + -- SIP (Plane 2) + (code >= 0x20000 and code <= 0x2A6DF) or -- CJK Unified Ideographs Extension B + (code >= 0x2A700 and code <= 0x2B73F) or -- CJK Unified Ideographs Extension C + (code >= 0x2B740 and code <= 0x2B81F) or -- CJK Unified Ideographs Extension D + (code >= 0x2B820 and code <= 0x2CEAF) or -- CJK Unified Ideographs Extension E + (code >= 0x2CEB0 and code <= 0x2EBEF) or -- CJK Unified Ideographs Extension F + (code >= 0x2F800 and code <= 0x2FA1F) or -- CJK Compatibility Ideographs Supplement + -- TIP (Plane 3) + (code >= 0x30000 and code <= 0x3134F)) -- CJK Unified Ideographs Extension G end --- Tests whether str contains CJK characters ---- @string str ---- @treturn boolean true if CJK function util.hasCJKChar(str) - return string.match(str, "[\228-\234][\128-\191].") ~= nil + for c in str:gmatch(util.UTF8_CHAR_PATTERN) do + if util.isCJKChar(c) then + return true + end + end + return false end --- Split texts into a list of words, spaces and punctuation marks. 
@@ -607,8 +645,10 @@ function util.splitToWords(text) for word in util.gsplit(text, "[%s%p]+", true) do -- if space split word contains CJK characters if util.hasCJKChar(word) then - -- split with CJK characters - for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do + -- split all non-ASCII characters separately (FIXME ideally we + -- would split only the CJK characters, but you cannot define CJK + -- characters trivially with a byte-only Lua pattern). + for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do table.insert(wlist, char) end else diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua index e2477ae95..34e38d96e 100644 --- a/spec/unit/util_spec.lua +++ b/spec/unit/util_spec.lua @@ -88,6 +88,23 @@ describe("util module", function() "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。", }, words) end) + it("should split Japanese words", function() + local words = util.splitToWords("色は匂へど散りぬるを我が世誰ぞ常ならむ") + assert.are_same({ + "色","は","匂","へ","ど","散","り","ぬ","る","を", + "我","が","世","誰","ぞ","常","な","ら","む", + }, words) + end) + it("should split Korean words", function() + -- Technically splitting on spaces is correct but we treat Korean + -- as if it were any other CJK text. 
+ local words = util.splitToWords("대한민국의 국기는 대한민국 국기법에 따라 태극기") + assert.are_same({ + "대","한","민","국","의"," ","국","기","는"," ", + "대","한","민","국"," ","국","기","법","에"," ", + "따","라"," ","태","극","기", + }, words) + end) it("should split words of multilingual text", function() local words = util.splitToWords("BBC纪录片") assert.are_same({"BBC", "纪", "录", "片"}, words) @@ -108,7 +125,7 @@ describe("util module", function() table.insert(table_of_words, word) word = "" end - if i == #table_chars then table.insert(table_of_words, word) end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end end assert.are_same({ "Pójdźże, ", @@ -121,7 +138,7 @@ describe("util module", function() "gavilán", }, table_of_words) end) - it("should split text to line - CJK", function() + it("should split text to line - CJK Chinese", function() local text = "彩虹是通过太阳光的折射引起的。" local word = "" local table_of_words = {} @@ -134,12 +151,76 @@ describe("util module", function() table.insert(table_of_words, word) word = "" end - if i == #table_chars then table.insert(table_of_words, word) end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end end assert.are_same({ "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。", }, table_of_words) end) + it("should split text to line - CJK Japanese", function() + local text = "色は匂へど散りぬるを我が世誰ぞ常ならむ" + local word = "" + local table_of_words = {} + local c + local table_chars = util.splitToChars(text) + for i = 1, #table_chars do + c = table_chars[i] + word = word .. 
c + if util.isSplittable(c) then + table.insert(table_of_words, word) + word = "" + end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end + end + assert.are_same({ + "色","は","匂","へ","ど","散","り","ぬ","る","を", + "我","が","世","誰","ぞ","常","な","ら","む", + }, table_of_words) + end) + it("should split text to line - CJK Korean", function() + local text = "대한민국의 국기는 대한민국 국기법에 따라 태극기" + local word = "" + local table_of_words = {} + local c + local table_chars = util.splitToChars(text) + for i = 1, #table_chars do + c = table_chars[i] + word = word .. c + if util.isSplittable(c) then + table.insert(table_of_words, word) + word = "" + end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end + end + assert.are_same({ + "대","한","민","국","의"," ","국","기","는"," ", + "대","한","민","국"," ","국","기","법","에"," ", + "따","라"," ","태","극","기", + }, table_of_words) + end) + it("should split text to line - mixed CJK and latin", function() + local text = "This is Russian: русский язык, Chinese: 汉语, Japanese: 日本語、 Korean: 한국어。" + local word = "" + local table_of_words = {} + local c + local table_chars = util.splitToChars(text) + for i = 1, #table_chars do + c = table_chars[i] + word = word .. c + if util.isSplittable(c) then + table.insert(table_of_words, word) + word = "" + end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end + end + assert.are_same({ + "This ", "is ", + "Russian: ", "русский ", "язык, ", + "Chinese: ", "汉","语",", ", + "Japanese: ", "日","本","語","、", " ", + "Korean: ", "한","국","어","。", + }, table_of_words) + end) it("should split text to line with next_c - unicode", function() local text = "Ce test : 1) est très simple ; 2 ) simple comme ( 2/2 ) > 50 % ? ok." 
local word = "" @@ -154,7 +235,7 @@ describe("util module", function() table.insert(table_of_words, word) word = "" end - if i == #table_chars then table.insert(table_of_words, word) end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end end assert.are_same({ "Ce ", @@ -187,7 +268,7 @@ describe("util module", function() table.insert(table_of_words, word) word = "" end - if i == #table_chars then table.insert(table_of_words, word) end + if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end end assert.are_same({ "Ce ",