Added util.fixUtf8 (#2704)

* Remove invalid UTF-8 chars from OPDS
* Add unit tests
pull/2711/head
Robert 7 years ago committed by Frans de Jonge
parent c73a1c1fcf
commit a3c4254809

@ -1,25 +1,25 @@
local MultiInputDialog = require("ui/widget/multiinputdialog")
local ButtonDialog = require("ui/widget/buttondialog")
local ButtonDialogTitle = require("ui/widget/buttondialogtitle")
local Cache = require("cache")
local CacheItem = require("cacheitem")
local InfoMessage = require("ui/widget/infomessage")
local LoginDialog = require("ui/widget/logindialog")
local OPDSParser = require("ui/opdsparser")
local NetworkMgr = require("ui/network/manager")
local UIManager = require("ui/uimanager")
local CacheItem = require("cacheitem")
local Menu = require("ui/widget/menu")
local MultiInputDialog = require("ui/widget/multiinputdialog")
local NetworkMgr = require("ui/network/manager")
local OPDSParser = require("ui/opdsparser")
local Screen = require("device").screen
local url = require('socket.url')
local T = require("ffi/util").template
local Cache = require("cache")
local logger = require("logger")
local UIManager = require("ui/uimanager")
local gettext = require("gettext")
local socket = require('socket')
local http = require('socket.http')
local https = require('ssl.https')
local logger = require("logger")
local ltn12 = require('ltn12')
local mime = require('mime')
local socket = require('socket')
local url = require('socket.url')
local util = require("util")
local T = require("ffi/util").template
local CatalogCacheItem = CacheItem:new{
size = 1024, -- fixed size for catalog item
@ -504,6 +504,7 @@ function OPDSBrowser:downloadFile(item, format, remote_url)
local local_path = download_dir .. "/" .. item.author .. ' - ' .. item.title .. "." .. string.lower(format)
logger.dbg("downloading file", local_path, "from", remote_url)
local_path = util.fixUtf8(local_path, "_")
local parsed = url.parse(remote_url)
http.TIMEOUT, https.TIMEOUT = 20, 20
local httpRequest = parsed.scheme == 'http' and http.request or https.request

@ -257,13 +257,13 @@ end
function util.replaceInvalidChars(str)
    -- Replaces every character matched by the set below with an underscore.
    -- The set covers \ / : * ? " < > | and, because commas are literal inside
    -- a Lua pattern set, the comma as well.
    -- Bytes above 127 are left untouched: the former second pass over
    -- [\224-\244] is gone, so multi-byte UTF-8 sequences pass through intact.
    -- Returns nothing when str is nil or false. Otherwise returns the result
    -- of gsub unchanged, i.e. the new string followed by the match count.
    if str then
        return str:gsub('[\\,%/,:,%*,%?,%",%<,%>,%|]','_')
    end
end
function util.replaceSlashChar(str)
    -- Replaces every forward slash in str with an underscore.
    -- Bytes above 127 are left untouched: the former second pass over
    -- [\224-\244] is gone, so multi-byte UTF-8 sequences pass through intact.
    -- Returns nothing when str is nil or false. Otherwise returns the result
    -- of gsub unchanged, i.e. the new string followed by the match count.
    if str then
        return str:gsub('%/','_')
    end
end
@ -299,4 +299,31 @@ function util.getMenuText(item)
return text
end
-- Returns the byte length (1 to 4) of the well-formed UTF-8 sequence starting
-- at byte index pos of str, or nil when the bytes at pos do not form one.
-- pos must be a valid index into str. The accepted ranges are those of
-- RFC 3629: overlong encodings, UTF-16 surrogates and code points above
-- U+10FFFF are rejected through the tightened second-byte bounds.
local function validUtf8SequenceLength(str, pos)
    local b1, b2, b3, b4 = str:byte(pos, pos + 3)
    if b1 <= 127 then
        return 1
    end

    local seq_len, b2_min, b2_max
    if b1 >= 194 and b1 <= 223 then
        seq_len, b2_min, b2_max = 2, 128, 191
    elseif b1 == 224 then
        seq_len, b2_min, b2_max = 3, 160, 191 -- lower bound excludes overlong forms
    elseif b1 == 237 then
        seq_len, b2_min, b2_max = 3, 128, 159 -- upper bound excludes surrogates
    elseif b1 >= 225 and b1 <= 239 then
        seq_len, b2_min, b2_max = 3, 128, 191
    elseif b1 == 240 then
        seq_len, b2_min, b2_max = 4, 144, 191 -- lower bound excludes overlong forms
    elseif b1 >= 241 and b1 <= 243 then
        seq_len, b2_min, b2_max = 4, 128, 191
    elseif b1 == 244 then
        seq_len, b2_min, b2_max = 4, 128, 143 -- upper bound caps at U+10FFFF
    else
        -- 128-193 and 245-255 can never start a sequence.
        return nil
    end

    -- Missing bytes (sequence truncated by the end of the string) are nil.
    if not b2 or b2 < b2_min or b2 > b2_max then
        return nil
    end
    if seq_len >= 3 and not (b3 and b3 >= 128 and b3 <= 191) then
        return nil
    end
    if seq_len == 4 and not (b4 and b4 >= 128 and b4 <= 191) then
        return nil
    end
    return seq_len
end

-- based on http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua
--- Replaces invalid UTF-8 characters with a replacement string.
-- Every byte that does not start a well-formed sequence is replaced on its
-- own, so a broken multi-byte sequence may yield several replacements.
-- The replacement itself is inserted verbatim and is not validated.
---- @string str the string to be checked for invalid characters
---- @string replacement the string to replace invalid characters with (defaults to an empty string)
---- @treturn string valid UTF-8
function util.fixUtf8(str, replacement)
    -- A missing replacement used to raise only when an invalid byte was
    -- actually met; default it so the outcome no longer depends on the data.
    if replacement == nil then
        replacement = ""
    end

    local len = #str
    local pieces = {}
    local piece_count = 0
    local run_start = 1 -- first byte of the valid run not yet copied to pieces
    local pos = 1
    while pos <= len do
        local seq_len = validUtf8SequenceLength(str, pos)
        if seq_len then
            pos = pos + seq_len
        else
            -- Flush the valid run before the offending byte, then substitute.
            pieces[piece_count + 1] = str:sub(run_start, pos - 1)
            pieces[piece_count + 2] = replacement
            piece_count = piece_count + 2
            pos = pos + 1
            run_start = pos
        end
    end

    if piece_count == 0 then
        -- Nothing was replaced: hand back the input untouched.
        return str
    end
    pieces[piece_count + 1] = str:sub(run_start)
    return table.concat(pieces)
end
return util

@ -222,4 +222,33 @@ describe("util module", function()
test("", "", "")
assert.are_same(util.splitFileNameSuffix("a.txt"), "a")
end)
-- Input bytes: 127, space, 128 (stray continuation byte), space,
-- 194 (lead byte without a continuation), 127, space.
it("should replace invalid UTF-8 characters with an underscore", function()
    assert.is_equal(util.fixUtf8("\127 \128 \194\127 ", "_"), "\127 _ _\127 ")
end)
it("should replace invalid UTF-8 characters with multiple characters", function()
    assert.is_equal(util.fixUtf8("\127 \128 \194\127 ", "__"), "\127 __ __\127 ")
end)
it("should replace invalid UTF-8 characters with empty char", function()
    -- Dropping \128 and \194 leaves the two surrounding spaces adjacent.
    assert.is_equal(util.fixUtf8("\127 \128 \194\127 ", ""), "\127  \127 ")
end)
-- \239\191\189 is U+FFFD (REPLACEMENT CHARACTER), itself valid UTF-8;
-- spelled with escapes so the bytes survive any re-encoding of this file.
it("should not replace valid UTF-8 \239\191\189 character", function()
    assert.is_equal(util.fixUtf8("\239\191\189valid \239\191\189 char \239\191\189", "__"), "\239\191\189valid \239\191\189 char \239\191\189")
end)
it("should not replace valid UTF-8 characters", function()
    assert.is_equal(util.fixUtf8("\99 \244\129\130\190", "_"), "\99 \244\129\130\190")
end)
it("should not replace valid UTF-8 characters Polish chars", function()
    assert.is_equal(util.fixUtf8("Pójdźże źółć", "_"), "Pójdźże źółć")
end)
it("should not replace valid UTF-8 characters German chars", function()
    assert.is_equal(util.fixUtf8("glück schließen", "_"), "glück schließen")
end)
end)

Loading…
Cancel
Save