[data-scanner] convert to using re2c

pull/242/head
Timothy Stack 9 years ago
parent bb5f3cc9c4
commit 054e8d489b

@ -2,7 +2,7 @@
language: cpp
compiler: gcc
before_install:
- sudo apt-get install -y -qq libgpm-dev
- sudo apt-get install -y -qq libgpm-dev re2c
script: ./autogen.sh && ./configure && make && make check && make distcheck
after_script: test -e test/test-suite.log && cat test/test-suite.log

@ -23,6 +23,7 @@ Lnav requires the following software packages:
readline - The readline line editing library.
zlib - The zlib compression library.
bz2 - The bzip2 compression library.
re2c - The re2c scanner generator.
INSTALLATION

@ -66,6 +66,8 @@ AC_PROG_LN_S
AC_PROG_MAKE_SET
AC_PATH_PROG(BZIP2_CMD, [bzip2])
AC_PATH_PROG(RE2C_CMD, [re2c])
dnl HAVE_RE2C is true only when the AC_PATH_PROG lookup for re2c set RE2C_CMD
dnl to a non-empty path. The x prefix on both sides keeps the comparison
dnl well-formed when the variable is empty.
AM_CONDITIONAL(HAVE_RE2C, test x"$RE2C_CMD" != x"")
AC_CHECK_SIZEOF(off_t)
AC_CHECK_SIZEOF(size_t)

@ -13,6 +13,10 @@ BIN2C_V = $(BIN2C_V_@AM_V@)
BIN2C_V_ = $(BIN2C_V_@AM_DEFAULT_V@)
BIN2C_V_0 = @echo " BIN2C " $@;
# Recipe prefix for re2c invocations.  RE2C_V selects one of the RE2C_V_*
# variables below; RE2C_V_0 prints a short " RE2C <target>" line and, through
# the leading @, keeps make from echoing the command itself.
RE2C_V = $(RE2C_V_@AM_V@)
RE2C_V_ = $(RE2C_V_@AM_DEFAULT_V@)
RE2C_V_0 = @echo " RE2C " $@;
help.c: $(srcdir)/help.txt bin2c
$(BIN2C_V)./bin2c -z -c $(srcdir)/help.txt $@
@ -53,6 +57,12 @@ TIME_FORMATS = \
time_fmts.cc: ptimec
$(PTIME_V)./ptimec $(TIME_FORMATS) > $@
if HAVE_RE2C
# Generate a C++ scanner from its re2c source, then copy the result into
# $(srcdir) unless the output and the $(srcdir) file are already the same
# file.  The copy step uses $(AM_V_at) so that it is silenced together with
# the other silent rules instead of always being echoed.
%.cc: %.re
	$(RE2C_V)$(RE2C_CMD) -o $@ $<
	$(AM_V_at)test $@ -ef $(srcdir)/$*.cc || cp $@ $(srcdir)/$*.cc
endif
AM_LDFLAGS = \
$(STATIC_LDFLAGS) \
$(READLINE_LDFLAGS) \
@ -90,6 +100,7 @@ noinst_HEADERS = \
column_namer.hh \
concise_index.hh \
data_scanner.hh \
data_scanner_re.re \
data_parser.hh \
default-log-formats-json.hh \
db_sub_source.hh \
@ -197,6 +208,7 @@ libdiag_a_SOURCES = \
logfile_sub_source.cc \
network-extension-functions.cc \
data_scanner.cc \
data_scanner_re.cc \
data_parser.cc \
ptimec_rt.cc \
readline_curses.cc \
@ -265,6 +277,7 @@ ptimec_SOURCES = ptimec.cc
ptimec_LDADD =
DISTCLEANFILES = \
data_scanner_re.cc \
dump-pid-sh.c \
help.c \
init-sql.c \

@ -769,7 +769,7 @@ private:
data_format_state_t comma_state = DFS_INIT;
memset(hist, 0, sizeof(hist));
while (this->dp_scanner->tokenize(pc, elem.e_token)) {
while (this->dp_scanner->tokenize2(pc, elem.e_token)) {
pcre_context::iterator pc_iter;
pc_iter = std::find_if(pc.begin(), pc.end(), capture_if_not(-1));

@ -128,6 +128,7 @@ public:
};
bool tokenize(pcre_context &pc, data_token_t &token_out);
bool tokenize2(pcre_context &pc, data_token_t &token_out);
pcre_input &get_input() { return this->ds_pcre_input; };

File diff suppressed because it is too large Load Diff

@ -0,0 +1,218 @@
/**
* Copyright (c) 2015, Timothy Stack
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of Timothy Stack nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include "data_scanner.hh"
/**
 * Scan the next token from this scanner's input with the re2c-generated
 * matcher below.
 *
 * Scanning starts at ds_pcre_input.pi_next_offset.  When a rule matches,
 * pi_next_offset is advanced to the end of the match and two captures are
 * stored in pc: capture 0 spans the whole match and capture 1 spans the same
 * range, except for quoted strings where the optional u/r prefix and the
 * surrounding quotes are excluded.
 *
 * @param pc        Receives the capture count (always 2) and the captures.
 * @param token_out Receives the type of the token that was matched.
 * @return true if a token was produced, false once the EOF rule (a NUL
 *         character) matches.  Reads at or past the end of the input yield
 *         NUL, so running off the end of the input also returns false.
 */
bool data_scanner::tokenize2(pcre_context &pc, data_token_t &token_out)
{
#   define YYCTYPE char
    /* Record the end of the match in both captures and advance the input. */
#   define CAPTURE(tok) { \
        pi.pi_next_offset = YYCURSOR.val - pi.get_string(); \
        cap[0].c_end = pi.pi_next_offset; \
        cap[1].c_end = pi.pi_next_offset; \
        token_out = tok; \
    }
#   define RET(tok) { \
        CAPTURE(tok); \
        return true; \
    }

    pcre_input &pi = this->ds_pcre_input;

    /*
     * Cursor handed to the generated scanner.  yyfill is disabled in the
     * re2c block, so instead of refilling a buffer, dereferencing the cursor
     * at or past 'lim' yields a NUL character, which the EOF rule matches.
     */
    struct _YYCURSOR {
        const YYCTYPE operator*() const {
            if (this->val < this->lim) {
                return *val;
            }
            return '\0';
        }

        operator const YYCTYPE *() const {
            return this->val;
        }

        const YYCTYPE *operator=(const YYCTYPE *rhs) {
            this->val = rhs;
            return rhs;
        }

        const YYCTYPE *operator+(int rhs) {
            return this->val + rhs;
        }

        _YYCURSOR& operator++() {
            this->val += 1;
            return *this;
        }

        const YYCTYPE *val;
        const YYCTYPE *lim;
    } YYCURSOR;

    YYCURSOR = pi.get_string() + pi.pi_next_offset;
    const YYCTYPE *YYLIMIT = pi.get_string() + pi.pi_length;
    const YYCTYPE *YYMARKER = YYCURSOR;
    const YYCTYPE *YYCTXMARKER = YYCURSOR;
    pcre_context::capture_t *cap = pc.all();

    YYCURSOR.lim = YYLIMIT;

    /* Both captures start where this scan starts; CAPTURE fills in the end. */
    pc.set_count(2);
    cap[0].c_begin = pi.pi_next_offset;
    cap[1].c_begin = pi.pi_next_offset;

    /*!re2c
    re2c:yyfill:enable = 0;
    SPACE = [ \t\r\n];
    ALPHA = [a-zA-Z];
    NUM = [0-9];
    ALPHANUM = [a-zA-Z0-9_];
    EOF = "\x00";
    IPV4SEG = ("25"[0-5]|("2"[0-4]|"1"{0,1}[0-9]){0,1}[0-9]);
    IPV4ADDR = (IPV4SEG"."){3,3}IPV4SEG;
    IPV6SEG = [0-9a-fA-F]{1,4};
    IPV6ADDR = (
    (IPV6SEG":"){7,7}IPV6SEG|
    (IPV6SEG":"){1,7}":"|
    (IPV6SEG":"){1,6}":"IPV6SEG|
    (IPV6SEG":"){1,5}(":"IPV6SEG){1,2}|
    (IPV6SEG":"){1,4}(":"IPV6SEG){1,3}|
    (IPV6SEG":"){1,3}(":"IPV6SEG){1,4}|
    (IPV6SEG":"){1,2}(":"IPV6SEG){1,5}|
    IPV6SEG":"((":"IPV6SEG){1,6})|
    ":"((":"IPV6SEG){1,7}|":")|
    [a-fA-F0-9]{4}":"(":"IPV6SEG){0,4}"%"[0-9a-zA-Z]{1,}|
    "::"('ffff'(":0"{1,4}){0,1}":"){0,1}IPV4ADDR|
    (IPV6SEG":"){1,4}":"IPV4ADDR
    );
    EOF { return false; }
    ("u"|"r")?'"'('\\'.|[^\x00\"])*'"' {
        CAPTURE(DT_QUOTED_STRING);
        switch (pi.get_string()[cap[1].c_begin]) {
        case 'u':
        case 'r':
            cap[1].c_begin += 1;
            break;
        }
        cap[1].c_begin += 1;
        cap[1].c_end -= 1;
        return true;
    }
    ("u"|"r")?"'"('\\'.|[^\x00\'])*"'" {
        CAPTURE(DT_QUOTED_STRING);
        switch (pi.get_string()[cap[1].c_begin]) {
        case 'u':
        case 'r':
            cap[1].c_begin += 1;
            break;
        }
        cap[1].c_begin += 1;
        cap[1].c_end -= 1;
        return true;
    }
    [a-zA-Z0-9]+"://"[^\x00\r\n\t '"\[\](){}]+[/a-zA-Z0-9\-=&] { RET(DT_URL); }
    ("/"|"./"|"../")[a-zA-Z0-9_\.\-_\~/]* { RET(DT_PATH); }
    (SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); }
    (SPACE|NUM)NUM":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); }
    [0-9a-fA-F][0-9a-fA-F](":"[0-9a-fA-F][0-9a-fA-F])+ {
        // A MAC address is exactly 17 characters long, so measure the matched
        // token itself and not the cursor offset from the start of the input.
        if (((YYCURSOR.val - pi.get_string()) - cap[0].c_begin) == 17) {
            RET(DT_MAC_ADDRESS);
        } else {
            RET(DT_HEX_DUMP);
        }
    }
    (NUM{4}"/"NUM{1,2}"/"NUM{1,2}|NUM{4}"-"NUM{1,2}"-"NUM{1,2}|NUM{2}"/"ALPHA{3}"/"NUM{4})"T"? {
        RET(DT_DATE);
    }
    IPV6ADDR/[^:a-zA-Z0-9] { RET(DT_IPV6_ADDRESS); }
    "<""?"?[a-zA-Z0-9_:]+SPACE*([a-zA-Z0-9_:]+(SPACE*'='SPACE*('"'(('\\'.|[^\x00"])+)'"'|"'"(('\\'.|[^\x00'])+)"'"|[^\x00>]+)))*SPACE*("/"|"?")">" {
        RET(DT_XML_EMPTY_TAG);
    }
    "<"[a-zA-Z0-9_:]+SPACE*([a-zA-Z0-9_:]+(SPACE*"="SPACE*('"'(('\\'.|[^\x00"])+)'"'|"'"(('\\'.|[^\x00'])+)"'"|[^\x00>]+)))*SPACE*">" {
        RET(DT_XML_OPEN_TAG);
    }
    "</"[a-zA-Z0-9:]+SPACE*">" {
        RET(DT_XML_CLOSE_TAG);
    }
    ":" { RET(DT_COLON); }
    "=" { RET(DT_EQUALS); }
    "," { RET(DT_COMMA); }
    ";" { RET(DT_SEMI); }
    "{" { RET(DT_LCURLY); }
    "}" { RET(DT_RCURLY); }
    "[" { RET(DT_LSQUARE); }
    "]" { RET(DT_RSQUARE); }
    "(" { RET(DT_LPAREN); }
    ")" { RET(DT_RPAREN); }
    "<" { RET(DT_LANGLE); }
    ">" { RET(DT_RANGLE); }
    IPV4ADDR/[^0-9] {
        RET(DT_IPV4_ADDRESS);
    }
    [0-9a-fA-F]{8}("-"[0-9a-fA-F]{4}){3}"-"[0-9a-fA-F]{12} { RET(DT_UUID); }
    [0-9]"."[0-9]+'e'[\-\+][0-9]+ { RET(DT_NUMBER); }
    [0-9]+("."[0-9]+[a-zA-Z0-9_]*){2,}("-"[a-zA-Z0-9_]+)?|[0-9]+("."[0-9]+[a-zA-Z0-9_]*)+"-"[a-zA-Z0-9_]+ {
        RET(DT_VERSION_NUMBER);
    }
    "-"?"0"[0-7]+ { RET(DT_OCTAL_NUMBER); }
    "-"?[0-9]+("."[0-9]+)?[ ]*"%" { RET(DT_PERCENTAGE); }
    "-"?[0-9]+("."[0-9]+)?([eE][\-+][0-9]+)? { RET(DT_NUMBER); }
    "-"?("0x"|[0-9])[0-9a-fA-F]+ { RET(DT_HEX_NUMBER); }
    [a-zA-Z0-9\._%+-]+"@"[a-zA-Z0-9\.-]+"."[a-zA-Z]+ { RET(DT_EMAIL); }
    ("true"|"True"|"TRUE"|"false"|"False"|"FALSE"|"None"|"null"|"NULL") { RET(DT_CONSTANT); }
    [a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); }
    [^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!@%\^&\*'\?<>\~`\|\\]+("::"[^\x00"; \r\n\t:=,\(\)\{\}\[\]\+#!@%\^&\*'\?<>\~`\|\\]+)* {
        RET(DT_SYMBOL);
    }
    ("\r"?"\n"|"\n") { RET(DT_LINE); }
    SPACE+ { RET(DT_WHITE); }
    "." { RET(DT_DOT); }
    . { RET(DT_GARBAGE); }
    */
}

@ -496,7 +496,7 @@ static void add_text_possibilities(
data_scanner ds(str);
data_token_t dt;
while (ds.tokenize(pc, dt)) {
while (ds.tokenize2(pc, dt)) {
if (pc[0]->length() < 4) {
continue;
}

@ -79,7 +79,7 @@ public:
pcre_context_static<30> pc;
data_token_t dt;
while (this->pp_scanner->tokenize(pc, dt)) {
while (this->pp_scanner->tokenize2(pc, dt)) {
element el(dt, pc);
switch (dt) {

@ -398,6 +398,8 @@ check_output "pretty-printer is not working" <<EOF
</value>
</response>
EOF
@ -424,6 +426,7 @@ Apr 7 05:49:53 Tim-Stacks-iMac.local GoogleSoftwareUpdateDaemon[17212]: -[KSUpd
activeTickets=1
rollCallTickets=1
body=
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<o:gupdate xmlns:o="http://www.google.com/update2/request" protocol="2.0" version="KeystoneDaemon-1.2.0.7709" ismachine="1" requestid="{0DFDBCD1-5E29-4DFC-BD99-31A2397198FE}">
<o:os platform="mac" version="MacOSX" sp="10.10.2_x86_64h"></o:os>

Loading…
Cancel
Save