From a82bafc63cdb41ca7266aef210fb8fddad87c31b Mon Sep 17 00:00:00 2001 From: Hans-Werner Hilse Date: Fri, 28 Nov 2014 14:48:44 +0000 Subject: [PATCH] fix handling of invalid UTF8 sequences. External data (and in bad cases our own) can contain invalid byte sequences in UTF8 strings. File names are a prominent example. There was an off-by-one bug in calculating the allowed length for multibyte chars, and the iterator was a bit too greedy when stumbling upon invalid sequences, returning a single "invalid" char for the whole sequence up to the point where it was found to be invalid during decoding. Now, we present one invalid char for the first byte of that sequence and then check for a valid char starting at the next byte. --- frontend/ui/rendertext.lua | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frontend/ui/rendertext.lua b/frontend/ui/rendertext.lua index 7747aa043..140e3dfed 100644 --- a/frontend/ui/rendertext.lua +++ b/frontend/ui/rendertext.lua @@ -44,7 +44,7 @@ local function utf8Chars(input) else return pos+1, 0xFFFD, "\xFF\xFD" end - if string.len(input) < (pos + bytes_left - 1) then + if string.len(input) < (pos + bytes_left) then return pos+1, 0xFFFD, "\xFF\xFD" end for i = pos+1, pos + bytes_left do @@ -52,7 +52,9 @@ local function utf8Chars(input) if bit.band(value, 0xC0) == 0x80 then glyph = bit.bor(bit.lshift(glyph, 6), bit.band(value, 0x3F)) else - return i+1, 0xFFFD, "\xFF\xFD" + -- invalid UTF8 continuation - don't be greedy, just skip + -- the initial char of the sequence. + return pos+1, 0xFFFD, "\xFF\xFD" end end -- TODO: check for valid ranges here!