From a3c4254809240812faf6749fa4fe069fbea4f510 Mon Sep 17 00:00:00 2001 From: Robert Date: Sun, 2 Apr 2017 16:17:49 +0200 Subject: [PATCH] Added util.fixUtf8 (#2704) * Remove invalid UTF-8 chars from OPDS * add unit tests --- frontend/ui/widget/opdsbrowser.lua | 23 +++++++++++----------- frontend/util.lua | 31 ++++++++++++++++++++++++++++-- spec/unit/util_spec.lua | 29 ++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/frontend/ui/widget/opdsbrowser.lua b/frontend/ui/widget/opdsbrowser.lua index ae5c83dd6..0095568ac 100644 --- a/frontend/ui/widget/opdsbrowser.lua +++ b/frontend/ui/widget/opdsbrowser.lua @@ -1,25 +1,25 @@ -local MultiInputDialog = require("ui/widget/multiinputdialog") local ButtonDialog = require("ui/widget/buttondialog") local ButtonDialogTitle = require("ui/widget/buttondialogtitle") +local Cache = require("cache") +local CacheItem = require("cacheitem") local InfoMessage = require("ui/widget/infomessage") local LoginDialog = require("ui/widget/logindialog") -local OPDSParser = require("ui/opdsparser") -local NetworkMgr = require("ui/network/manager") -local UIManager = require("ui/uimanager") -local CacheItem = require("cacheitem") local Menu = require("ui/widget/menu") +local MultiInputDialog = require("ui/widget/multiinputdialog") +local NetworkMgr = require("ui/network/manager") +local OPDSParser = require("ui/opdsparser") local Screen = require("device").screen -local url = require('socket.url') -local T = require("ffi/util").template -local Cache = require("cache") -local logger = require("logger") +local UIManager = require("ui/uimanager") local gettext = require("gettext") - -local socket = require('socket') local http = require('socket.http') local https = require('ssl.https') +local logger = require("logger") local ltn12 = require('ltn12') local mime = require('mime') +local socket = require('socket') +local url = require('socket.url') +local util = require("util") +local T = 
require("ffi/util").template local CatalogCacheItem = CacheItem:new{ size = 1024, -- fixed size for catalog item @@ -504,6 +504,7 @@ function OPDSBrowser:downloadFile(item, format, remote_url) local local_path = download_dir .. "/" .. item.author .. ' - ' .. item.title .. "." .. string.lower(format) logger.dbg("downloading file", local_path, "from", remote_url) + local_path = util.fixUtf8(local_path, "_") local parsed = url.parse(remote_url) http.TIMEOUT, https.TIMEOUT = 20, 20 local httpRequest = parsed.scheme == 'http' and http.request or https.request diff --git a/frontend/util.lua b/frontend/util.lua index 04dc65700..871c44032 100644 --- a/frontend/util.lua +++ b/frontend/util.lua @@ -257,13 +257,13 @@ end function util.replaceInvalidChars(str) if str then - return str:gsub('[\\,%/,:,%*,%?,%",%<,%>,%|]','_'):gsub("([\224-\244]+)",'_') + return str:gsub('[\\,%/,:,%*,%?,%",%<,%>,%|]','_') end end function util.replaceSlashChar(str) if str then - return str:gsub('%/','_'):gsub("([\224-\244]+)",'_') + return str:gsub('%/','_') end end @@ -299,4 +299,31 @@ function util.getMenuText(item) return text end +-- from http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua with modification +--- Replaces invalid UTF-8 characters with a replacement string. 
+---- @string str the string to be checked for invalid characters
+---- @string replacement the string to replace invalid characters with
+---- @treturn string valid UTF-8
+function util.fixUtf8(str, replacement)
+    local pos = 1
+    local len = #str
+    while pos <= len do -- every pattern is "^"-anchored: find() tests only at pos, never scans ahead
+        if     str:find("^[%z\1-\127]", pos) then pos = pos + 1 -- ASCII
+        elseif str:find("^[\194-\223][\128-\191]", pos) then pos = pos + 2 -- 2-byte sequence
+        elseif str:find(       "^\224[\160-\191][\128-\191]", pos) -- 3-byte, no overlongs
+            or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
+            or str:find(       "^\237[\128-\159][\128-\191]", pos) -- 3-byte, no surrogates
+            or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
+        elseif str:find(       "^\240[\144-\191][\128-\191][\128-\191]", pos) -- 4-byte, no overlongs
+            or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
+            or str:find(       "^\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4 -- up to U+10FFFF
+        else -- no valid sequence starts at pos: swap this single byte for the replacement
+            str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1)
+            pos = pos + #replacement -- continue right after the inserted text
+            len = len + #replacement - 1 -- one byte dropped, #replacement bytes added
+        end
+    end
+    return str
+end
+
 return util
diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua
index 5b143e2f1..1edc85a53 100644
--- a/spec/unit/util_spec.lua
+++ b/spec/unit/util_spec.lua
@@ -222,4 +222,33 @@ describe("util module", function()
         test("", "", "")
         assert.are_same(util.splitFileNameSuffix("a.txt"), "a")
     end)
+
+    it("should replace invalid UTF-8 characters with an underscore", function()
+        assert.is_equal(util.fixUtf8("\127 \128 \194\127 ", "_"), "\127 _ _\127 ")
+    end)
+
+    it("should replace invalid UTF-8 characters with multiple characters", function()
+        assert.is_equal(util.fixUtf8("\127 \128 \194\127 ", "__"), "\127 __ __\127 ")
+    end)
+
+    it("should replace invalid UTF-8 characters with empty char", function()
+        assert.is_equal(util.fixUtf8("\127 \128 \194\127 ", ""), "\127  \127 ")
+    end)
+
+    it("should not replace valid UTF-8 � character", function()
+        assert.is_equal(util.fixUtf8("�valid � char �", "__"), "�valid � char �")
+    
end) + + it("should not replace valid UTF-8 characters", function() + assert.is_equal(util.fixUtf8("\99 \244\129\130\190", "_"), "\99 \244\129\130\190") + end) + + it("should not replace valid UTF-8 characters Polish chars", function() + assert.is_equal(util.fixUtf8("Pójdźże źółć", "_"), "Pójdźże źółć") + end) + + it("should not replace valid UTF-8 characters German chars", function() + assert.is_equal(util.fixUtf8("glück schließen", "_"), "glück schließen") + end) + end)