From a82bafc63cdb41ca7266aef210fb8fddad87c31b Mon Sep 17 00:00:00 2001
From: Hans-Werner Hilse <hwhilse@gmail.com>
Date: Fri, 28 Nov 2014 14:48:44 +0000
Subject: [PATCH] fix handling of invalid UTF8 sequences

External data (and, in bad cases, our own) can contain invalid byte
sequences in UTF-8 strings. File names are a prominent example.
There was an off-by-one bug in calculating the allowed length for
multibyte chars, and the iterator was a bit too greedy when stumbling
upon invalid sequences: it returned a single "invalid" char covering
the whole sequence up to and including the byte at which it was found
to be invalid. Now, we present one invalid char for just the first byte
of that sequence and then check for a valid char starting with the
next byte.
---
 frontend/ui/rendertext.lua | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/frontend/ui/rendertext.lua b/frontend/ui/rendertext.lua
index 7747aa043..140e3dfed 100644
--- a/frontend/ui/rendertext.lua
+++ b/frontend/ui/rendertext.lua
@@ -44,7 +44,7 @@ local function utf8Chars(input)
             else
                 return pos+1, 0xFFFD, "\xFF\xFD"
             end
-            if string.len(input) < (pos + bytes_left - 1) then
+            if string.len(input) < (pos + bytes_left) then
                 return pos+1, 0xFFFD, "\xFF\xFD"
             end
             for i = pos+1, pos + bytes_left do
@@ -52,7 +52,9 @@ local function utf8Chars(input)
                 if bit.band(value, 0xC0) == 0x80 then
                     glyph = bit.bor(bit.lshift(glyph, 6), bit.band(value, 0x3F))
                 else
-                    return i+1, 0xFFFD, "\xFF\xFD"
+                    -- invalid UTF8 continuation - don't be greedy, just skip
+                    -- the initial char of the sequence.
+                    return pos+1, 0xFFFD, "\xFF\xFD"
                 end
             end
             -- TODO: check for valid ranges here!