[fix] Don't break OPDS parsing on HR tags (#5949)

Apply the same crappy workaround as for BR.

Fix #5948
reviewable/pr5962/r2
NiLuJe 4 years ago committed by GitHub
parent bb0c01757d
commit 098c1a7844
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -8,6 +8,7 @@
Pure Lua Version written by: William A Adams
Dramatic Speed Improvements by: Robert G Jakabosky
https://github.com/Wiladams/LAPHLibs/blob/master/laphlibs/luxl.lua
References
@ -20,10 +21,15 @@ local ffi = require "ffi"
local bit = require "bit"
local band = bit.band
--[[
Types of characters; 0 is not valid, 1 is letters, 2 are digits
(including '.') and 3 whitespace.
Types of characters;
0 is not valid
1 is letters,
2 are digits (including '.')
3 whitespace
--]]
local char_type = ffi.new("const int[256]", {
0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -43,22 +49,24 @@ local char_type = ffi.new("const int[256]", {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
});
-- Internal states that the parser can be in at any given time.
local ST_START = 0; -- starting base state; default state
local ST_START = 0; -- starting base state; default state
local ST_TEXT =1; -- text state
local ST_START_TAG = 2; -- start tag state
local ST_START_TAG = 2; -- start tag state
local ST_START_TAGNAME =3; -- start tagname state
local ST_START_TAGNAME_END =4; -- start tagname ending state
local ST_END_TAG =5; -- end tag state
local ST_END_TAGNAME=6; -- end tag tagname state
local ST_END_TAGNAME_END=7; -- end tag tagname ending
local ST_EMPTY_TAG=8; -- empty tag state
local ST_SPACE=9; -- linear whitespace state
local ST_END_TAGNAME=6; -- end tag tagname state
local ST_END_TAGNAME_END=7; -- end tag tagname ending
local ST_EMPTY_TAG=8; -- empty tag state
local ST_SPACE=9; -- linear whitespace state
local ST_ATTR_NAME=10; -- attribute name state
local ST_ATTR_NAME_END=11; -- attribute name ending state
local ST_ATTR_VAL=12; -- attribute value starting state
local ST_ATTR_VAL2=13; -- attribute value state
local ST_ERROR=14; -- error state
local ST_ERROR=14; -- error state
-- character classes that we will match against; This could be expanded if
-- need be, however, we are aiming for simple.
@ -74,14 +82,14 @@ local CCLASS_ANY=8; -- matches any ASCII character; will match all ab
-- Types of events: start element, end element, text, attr name, attr
-- val and start/end document. Other events can be ignored!
local EVENT_START = 0; -- Start tag
local EVENT_END = 1; -- End tag
local EVENT_TEXT = 2; -- Text
local EVENT_ATTR_NAME = 3; -- Attribute name
local EVENT_ATTR_VAL = 4; -- Attribute value
local EVENT_END_DOC = 5; -- End of document
local EVENT_MARK = 6; -- Internal only; notes position in buffer
local EVENT_NONE = 7; -- Internal only; should never see this event
local EVENT_START = 0; -- Start tag
local EVENT_END = 1; -- End tag
local EVENT_TEXT = 2; -- Text
local EVENT_ATTR_NAME = 3; -- Attribute name
local EVENT_ATTR_VAL = 4; -- Attribute value
local EVENT_END_DOC = 5; -- End of document
local EVENT_MARK = 6; -- Internal only; notes position in buffer
local EVENT_NONE = 7; -- Internal only; should never see this event
local entity_refs = {
["&lt;"] = '<',
@ -133,7 +141,7 @@ local LEXER_STATES = {
-- [6-8] handle start tag
{ state = ST_START_TAG, cclass = CCLASS_LETTERS, next_state = ST_START_TAGNAME, event = EVENT_MARK },
{ state = ST_START_TAG, cclass = CCLASS_SLASH, next_state = ST_END_TAG, event = EVENT_MARK },
{ state = ST_START_TAG, cclass = CCLASS_SPACE, next_state = ST_START_TAG, event = EVENT_NONE }, -- < tag >
{ state = ST_START_TAG, cclass = CCLASS_SPACE, next_state = ST_START_TAG, event = EVENT_NONE }, -- < tag >
-- [9-12] handle start tag name
{ state = ST_START_TAGNAME, cclass = CCLASS_LETTERS, next_state = ST_START_TAGNAME, event = EVENT_NONE },
@ -145,10 +153,10 @@ local LEXER_STATES = {
{ state = ST_START_TAGNAME_END, cclass = CCLASS_LETTERS, next_state = ST_ATTR_NAME, event = EVENT_MARK },
{ state = ST_START_TAGNAME_END, cclass = CCLASS_SPACE, next_state = ST_START_TAGNAME_END, event = EVENT_NONE },
{ state = ST_START_TAGNAME_END, cclass = CCLASS_RIGHT_ANGLE, next_state = ST_START, event = EVENT_START },
{ state = ST_START_TAGNAME_END, cclass = CCLASS_SLASH, next_state = ST_EMPTY_TAG, event = EVENT_MARK }, -- Empty tag <br />
{ state = ST_START_TAGNAME_END, cclass = CCLASS_SLASH, next_state = ST_EMPTY_TAG, event = EVENT_MARK }, -- Empty tag <br />
-- [17] handle empty tags, e.g., <br />
{ state = ST_EMPTY_TAG, cclass = CCLASS_RIGHT_ANGLE, next_state = ST_START, event = EVENT_END }, -- Empty tag <br />
{ state = ST_EMPTY_TAG, cclass = CCLASS_RIGHT_ANGLE, next_state = ST_START, event = EVENT_END }, -- Empty tag <br />
-- [18] handle end tag, e.g., </tag>
{ state = ST_END_TAG, cclass = CCLASS_LETTERS, next_state = ST_END_TAGNAME, event = EVENT_NONE },
@ -156,7 +164,7 @@ local LEXER_STATES = {
-- [19-21] handle end tag name
{ state = ST_END_TAGNAME, cclass = CCLASS_LETTERS, next_state = ST_END_TAGNAME, event = EVENT_NONE },
{ state = ST_END_TAGNAME, cclass = CCLASS_RIGHT_ANGLE, next_state = ST_START, event = EVENT_END },
{ state = ST_END_TAGNAME, cclass = CCLASS_SPACE, next_state = ST_END_TAGNAME_END, event = EVENT_END }, -- space after end tag name </br >
{ state = ST_END_TAGNAME, cclass = CCLASS_SPACE, next_state = ST_END_TAGNAME_END, event = EVENT_END }, -- space after end tag name </br >
-- [22-23] handle ending of end tag name
{ state = ST_END_TAGNAME_END, cclass = CCLASS_SPACE, next_state = ST_END_TAGNAME_END, event = EVENT_NONE },
@ -169,8 +177,8 @@ local LEXER_STATES = {
-- [27-29] handle attribute names
{ state = ST_ATTR_NAME, cclass = CCLASS_LETTERS, next_state = ST_ATTR_NAME, event = EVENT_MARK },
{ state = ST_ATTR_NAME, cclass = CCLASS_SPACE, next_state = ST_ATTR_NAME_END, event = EVENT_ATTR_NAME }, -- space before '=' sign
{ state = ST_ATTR_NAME, cclass = CCLASS_EQUALS, next_state = ST_ATTR_VAL, event = EVENT_ATTR_NAME }, -- <tag attr ="2">
{ state = ST_ATTR_NAME, cclass = CCLASS_SPACE, next_state = ST_ATTR_NAME_END, event = EVENT_ATTR_NAME }, -- space before '=' sign
{ state = ST_ATTR_NAME, cclass = CCLASS_EQUALS, next_state = ST_ATTR_VAL, event = EVENT_ATTR_NAME }, -- <tag attr ="2">
-- [30-32] attribute name end
{ state = ST_ATTR_NAME_END, cclass = CCLASS_SPACE, next_state = ST_ATTR_NAME_END, event = EVENT_NONE },
@ -198,6 +206,7 @@ struct parse_state {
int i;
int ix; /* index into buffer */
};
]]
local cclass_match = {
@ -320,28 +329,28 @@ fsm_code = nil
local luxl = {
EVENT_START = EVENT_START; -- Start tag
EVENT_END = EVENT_END; -- End tag
EVENT_TEXT = EVENT_TEXT; -- Text
EVENT_START = EVENT_START; -- Start tag
EVENT_END = EVENT_END; -- End tag
EVENT_TEXT = EVENT_TEXT; -- Text
EVENT_ATTR_NAME = EVENT_ATTR_NAME; -- Attribute name
EVENT_ATTR_VAL = EVENT_ATTR_VAL; -- Attribute value
EVENT_END_DOC = EVENT_END_DOC; -- End of document
EVENT_MARK = EVENT_MARK; -- Internal only; notes position in buffer
EVENT_NONE = EVENT_NONE; -- Internal only; should never see this event
EVENT_ATTR_VAL = EVENT_ATTR_VAL; -- Attribute value
EVENT_END_DOC = EVENT_END_DOC; -- End of document
EVENT_MARK = EVENT_MARK; -- Internal only; notes position in buffer
EVENT_NONE = EVENT_NONE; -- Internal only; should never see this event
}
local luxl_mt = { __index = luxl }
function luxl.new(buffer, bufflen)
local newone = {
buf = ffi.cast("const uint8_t *", buffer); -- pointer to "uint8_t *" buffer (0 based)
bufsz = bufflen; -- size of input buffer
state = ST_START; -- current state
event = EVENT_NONE; -- current event
err = 0; -- number of errors thus far
markix = 0; -- offset of current item of interest
marksz = 0; -- size of current item of interest
MsgHandler = nil; -- Routine to handle messages
ErrHandler = nil; -- Routine to call when there's an error
buf = ffi.cast("const uint8_t *", buffer); -- pointer to "uint8_t *" buffer (0 based)
bufsz = bufflen; -- size of input buffer
state = ST_START; -- current state
event = EVENT_NONE; -- current event
err = 0; -- number of errors thus far
markix = 0; -- offset of current item of interest
marksz = 0; -- size of current item of interest
MsgHandler = nil; -- Routine to handle messages
ErrHandler = nil; -- Routine to call when there's an error
EventHandler = nil;
ps = ffi.new('struct parse_state', {
buf = buffer,
@ -357,13 +366,13 @@ function luxl.new(buffer, bufflen)
end
function luxl:Reset(buffer, bufflen)
self.buf = buffer -- pointer to "uint8_t *" buffer (0 based)
self.bufsz = bufflen -- size of input buffer
self.state = ST_START -- current state
self.event = EVENT_NONE -- current event
self.err = 0 -- number of errors thus far
self.markix = 0 -- offset of current item of interest
self.marksz = 0 -- size of current item of interest
self.buf = buffer -- pointer to "uint8_t *" buffer (0 based)
self.bufsz = bufflen -- size of input buffer
self.state = ST_START -- current state
self.event = EVENT_NONE -- current event
self.err = 0 -- number of errors thus far
self.markix = 0 -- offset of current item of interest
self.marksz = 0 -- size of current item of interest
local ps = self.ps
ps.buf = buffer
ps.bufsz = bufflen

@ -79,6 +79,9 @@ function OPDSParser:parse(text)
-- but will kick the ass of luxl
text = text:gsub("<br>", "<br />")
text = text:gsub("<br/>", "<br />")
-- Same deal with hr
text = text:gsub("<hr>", "<hr />")
text = text:gsub("<hr/>", "<hr />")
-- some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems
text = text:gsub("<!%[CDATA%[(.-)%]%]>", function (s)
return s:gsub( "%p", {["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;" } )

Loading…
Cancel
Save