You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
lnav/src/data_parser.hh

630 lines
17 KiB
C++

/**
* Copyright (c) 2007-2012, Timothy Stack
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of Timothy Stack nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __data_parser_hh
#define __data_parser_hh
#include <stdio.h>
#include <openssl/sha.h>
#include <list>
#include <vector>
#include <iterator>
#include <algorithm>
#include "pcrepp.hh"
#include "byte_array.hh"
#include "data_scanner.hh"
/**
* Switch to the 'parser' view mode when the user hits ';' so they
* can easily see what columns are available.
*
* select * from logline;
* select itemfrom(csv_key, 0) from logline;
* select itemfrom(csv_key, -1) from logline;
* select itemfrom(dict_key, "key") from logline;
* select itemfrom(dict_key, "key[0]") from logline;
* select itemfrom(csv_key, 0:3) from logline; support splices ?
*
* Add a command to create a logline table with a given name so the user can
* do joins across the tables:
*
* create-logline-table sudo_logline
* select * from logline, sudo_logline where sudo_logline.COMMAND=logline.COMMAND;
*
* select timestamp / 60 as minute, sc_status, count(*) from access_log
* group by minute, sc_status
* order by minute, sc_status desc;
* (use group_concat() here?)
*
* The "itemfrom()" function parses the group and lets you specify an
* expression to query the contents.
*
* For 'report-on' command:
* 'report-on PWD'
* select PWD,count(*) as amount from logline group by PWD order by amount desc;
* 'report-on num_col num_col2'
* select avg(num_col),stddev(num_col),... from logline;
* Instead of a command, we should automatically create views with the
* relevant select statements.
*
* Add a tojson() aggregate function to sqlite:
* select foo,tojson(bar) group by foo;
*
* 1 ["a", "b", "c"]
* 2 ["d", "e", "f"]
*
* We should automatically detect sqlite files provided on the command line
* and attach the database.
*
* add a 'metadata' view that has all the metadata crud (sql tables/log)
*
* Add support for sqlite_log that writes to a temp file and is displayed in the
* metadata view.
*
* Add a function that bookmarks all lines in the log view based on line_numbers
* in the sql query result.
*
* select line_number from logline where A="b";
* hit 'y/Y' to move forward and backwards through sql results
* hit 'R' key and all the lines are bookmarked
*
* add path manipulation functions like basename, dirname, splitext
*
* use the vt52_curses emulation to embed an editor for editing queries. For
* example, you could hit 'ctrl+;' and it would split the window in half with
* the bottom being used for nano. When the file was written, lnav should
* notice and do a 'prepare' on the sql to make sure it is correct.
*
* Maybe add other tables for accessing lnav state. For example, you could do
* a query to find log lines of interest and then insert their line numbers
* into the 'bookmarks' table to create new user bookmarks.
*/
/**
 * Drop every leading and every trailing element of a container for which
 * the predicate holds. Elements in the interior are never examined once a
 * non-matching element has been found at that end.
 *
 * @param container Sequence offering empty(), front(), back(), pop_front()
 * and pop_back(); it is modified in place.
 * @param p Unary predicate invoked with an element of the container.
 */
template<class Container, class UnaryPredicate>
void strip(Container &container, UnaryPredicate p)
{
    /* Trim the head first... */
    for (;;) {
        if (container.empty() || !p(container.front())) {
            break;
        }
        container.pop_front();
    }

    /* ...then the tail of whatever is left. */
    for (;;) {
        if (container.empty() || !p(container.back())) {
            break;
        }
        container.pop_back();
    }
}
/**
 * States reported by the format detectors dfs_semi_next() and
 * dfs_comma_next().
 */
enum data_format_state_t {
    /** The token stream does not fit the format being tested. */
    DFS_ERROR = -1,
    /** Starting state, before any token has been fed in. */
    DFS_INIT  = 0,
    /* NOTE(review): presumably "currently reading a key" -- the transition
     * functions are defined elsewhere, confirm there. */
    DFS_KEY   = 1,
    /* NOTE(review): presumably "currently reading a value" -- confirm. */
    DFS_VALUE = 2
};
/**
 * The pair of tokens that characterises a message format. data_parser
 * keeps three instances (FORMAT_SEMI, FORMAT_COMMA, FORMAT_PLAIN) and
 * data_parser::pairup() compares element tokens against both members when
 * it splits the text between separators into a value and a key.
 */
struct data_format {
    /** Token compared first when scanning backwards from a separator. */
    const data_token_t df_appender;
    /** Token that ends the previous value; it is discarded when found. */
    const data_token_t df_terminator;

    /**
     * @param appender Value for df_appender, DT_INVALID by default.
     * @param terminator Value for df_terminator, DT_INVALID by default.
     */
    data_format(data_token_t appender = DT_INVALID,
                data_token_t terminator = DT_INVALID)
        : df_appender(appender),
          df_terminator(terminator)
    {
    }
};
/**
 * Advance the detector associated with data_parser::FORMAT_SEMI by one
 * token. Declared here, defined elsewhere.
 *
 * data_parser::discover_format() starts from DFS_INIT, feeds every scanned
 * token through this function, and only selects FORMAT_SEMI if the final
 * state is not DFS_ERROR (and at least one DT_SEMI token was seen).
 *
 * @param state The state returned by the previous call (or DFS_INIT).
 * @param next_token The token that was just scanned.
 * @return The new detector state.
 */
data_format_state_t dfs_semi_next(data_format_state_t state,
data_token_t next_token);
/**
 * Advance the detector associated with data_parser::FORMAT_COMMA by one
 * token. Declared here, defined elsewhere.
 *
 * data_parser::discover_format() uses it exactly like dfs_semi_next():
 * FORMAT_COMMA is eligible only when the final state is not DFS_ERROR.
 *
 * @param state The state returned by the previous call (or DFS_INIT).
 * @param next_token The token that was just scanned.
 * @return The new detector state.
 */
data_format_state_t dfs_comma_next(data_format_state_t state,
data_token_t next_token);
/**
 * Turns the token stream of a data_scanner into a tree of elements and
 * then into a list of key/value pairs.
 *
 * parse() is the entry point: it runs discover_format() to tokenize the
 * input, nest bracketed groups and pick a data_format, then pairup() to
 * fill dp_pairs and dp_schema_id.
 */
class data_parser {
public:
/* The formats discover_format() chooses between; defined out of line. */
static data_format FORMAT_SEMI;
static data_format FORMAT_COMMA;
static data_format FORMAT_PLAIN;
/* 20-byte buffer that receives the digest written by SHA_Final() in
 * pairup(). */
typedef byte_array<20> schema_id_t;
struct element;
/* Elements are kept in std::list so pairup() can splice them between
 * lists. */
typedef std::list<element> element_list_t;
struct element {
element() : e_token(DT_INVALID), e_sub_elements(NULL) { };
element(element_list_t &subs,
data_token_t token,
bool assign_subs_elements = true)
: e_capture(subs.front().e_capture.c_begin,
subs.back().e_capture.c_end),
e_token(token),
e_sub_elements(NULL) {
if (assign_subs_elements) {
this->assign_elements(subs);
}
};
element(const element &other) {
// assert(other.e_sub_elements == NULL);
this->e_capture = other.e_capture;
this->e_token = other.e_token;
this->e_sub_elements = NULL;
if (other.e_sub_elements != NULL)
this->assign_elements(*other.e_sub_elements);
};
~element() {
if (this->e_sub_elements != NULL) {
delete this->e_sub_elements;
this->e_sub_elements = NULL;
}
};
element& operator=(const element &other) {
this->e_capture = other.e_capture;
this->e_token = other.e_token;
this->e_sub_elements = NULL;
if (other.e_sub_elements != NULL)
this->assign_elements(*other.e_sub_elements);
return *this;
};
void assign_elements(element_list_t &subs) {
if (this->e_sub_elements == NULL)
this->e_sub_elements = new element_list_t();
this->e_sub_elements->swap(subs);
this->update_capture();
};
void update_capture(void) {
if (this->e_sub_elements != NULL) {
this->e_capture.c_begin =
this->e_sub_elements->front().e_capture.c_begin;
this->e_capture.c_end =
this->e_sub_elements->back().e_capture.c_end;
}
};
data_token_t value_token(void) const {
data_token_t retval = DT_INVALID;
if (this->e_token == DNT_VALUE &&
this->e_sub_elements != NULL &&
this->e_sub_elements->size() == 1) {
retval = this->e_sub_elements->front().e_token;
}
return retval;
};
void print(FILE *out, pcre_input &pi, int offset = 0) {
int lpc;
if (this->e_sub_elements != NULL) {
for (element_list_t::iterator iter2 =
this->e_sub_elements->begin();
iter2 != this->e_sub_elements->end();
++iter2) {
iter2->print(out, pi, offset + 1);
}
}
fprintf(out, "%4s %3d:%-3d ",
data_scanner::token2name(this->e_token),
this->e_capture.c_begin,
this->e_capture.c_end);
for (lpc = 0; lpc < this->e_capture.c_end; lpc++) {
if (lpc == this->e_capture.c_begin)
fputc('^', out);
else if (lpc == (this->e_capture.c_end - 1))
fputc('^', out);
else if (lpc > this->e_capture.c_begin)
fputc('-', out);
else
fputc(' ', out);
}
for (; lpc < (int)pi.pi_length; lpc++) {
fputc(' ', out);
}
std::string sub = pi.get_substr(&this->e_capture);
fprintf(out, " %s\n", sub.c_str());
};
pcre_context::capture_t e_capture;
data_token_t e_token;
element_list_t *e_sub_elements;
};
struct element_cmp {
bool operator()(data_token_t token, const element &elem) const {
return token == elem.e_token || token == DT_ANY;
};
bool operator()(const element &elem, data_token_t token) const {
return (*this)(token, elem);
};
};
/**
 * Unary predicate that is true for elements carrying one specific token.
 * pairup() uses it with strip() and std::list::remove_if().
 */
struct element_if {
    /** @param token The token an element must have to match. */
    element_if(data_token_t token) : ei_token(token) { }

    bool operator()(const element &a) const
    {
        return this->ei_token == a.e_token;
    }

private:
    /** The token to look for. */
    data_token_t ei_token;
};
/**
 * Create a parser reading from the given scanner. Only the pointer is
 * stored. No format is selected (dp_format is NULL) until
 * discover_format() has run.
 *
 * @param ds The scanner that supplies the tokens and the input text.
 */
data_parser(data_scanner *ds)
    : dp_format(NULL),
      dp_scanner(ds)
{
}
/**
 * Group a list of elements into DNT_PAIR elements (a DNT_KEY followed by a
 * DNT_VALUE) and append them to pairs_out.
 *
 * Outline:
 *  1. Every DNT_GROUP in in_list is paired up recursively; if that yields
 *     any pairs they replace the group's children.
 *  2. Elements are collected until a DT_SEPARATOR. What was collected is
 *     then split into the tail of the previous value and the key for the
 *     next one, using the appender/terminator tokens of dp_format.
 *  3. Adjacent KEY,VALUE entries become pairs; the key text is fed to the
 *     SHA hash when 'schema' is non-NULL.
 *  4. If exactly one pair came out and its value has several children,
 *     the pair is dissolved: its key is kept as a "prefix" and the
 *     children of the value are treated as a row of free-standing values.
 *  5. With no pairs, each free-standing element of a value-like type gets
 *     a pair with an empty key; the text of the others goes into the
 *     hash.
 *
 * dp_format must have been set (see discover_format()) because it is
 * dereferenced whenever a separator is seen.
 *
 * @param schema Only tested against NULL: when non-NULL the key strings
 * are hashed and the digest is stored. NOTE(review): the digest is always
 * written to this->dp_schema_id, not through this pointer; parse() passes
 * &this->dp_schema_id so the two coincide there.
 * @param pairs_out Receives the resulting pairs.
 * @param in_list The elements to group. It is modified: groups get new
 * children and copying an element out of it takes over that element's
 * children.
 */
void pairup(schema_id_t *schema, element_list_t &pairs_out, element_list_t &in_list)
{
    element_list_t el_stack, free_row, key_comps, value, prefix;
    SHA_CTX context;

    for (element_list_t::iterator iter = in_list.begin();
         iter != in_list.end();
         ++iter) {
        if (iter->e_token == DNT_GROUP) {
            element_list_t group_pairs;

            this->pairup(NULL, group_pairs, *iter->e_sub_elements);
            if (!group_pairs.empty()) {
                iter->assign_elements(group_pairs);
            }
        }

        if (iter->e_token == DT_SEPARATOR) {
            bool found = false;

            /*
             * Walk backwards over what was collected since the previous
             * separator (the first element is never examined). Nothing to
             * scan when the list is empty; decrementing end() of an empty
             * list would be undefined.
             */
            if (!key_comps.empty()) {
                element_list_t::iterator key_iter = key_comps.end();

                --key_iter;
                while (key_iter != key_comps.begin()) {
                    if (key_iter->e_token == this->dp_format->df_appender) {
                        /* Everything up to and including the appender
                         * belongs to the previous value. */
                        ++key_iter;
                        value.splice(value.end(),
                                     key_comps,
                                     key_comps.begin(),
                                     key_iter);
                        /*
                         * NOTE(review): the original also called
                         * key_comps.splice(key_comps.begin(), key_comps,
                         * key_comps.end()) here. Passing end() as the
                         * element to move is undefined by the standard;
                         * mainstream library implementations appear to
                         * treat it as a no-op, so it is dropped. Confirm
                         * that the key really should be the first element
                         * after the appender.
                         */
                        key_comps.resize(1);
                        found = true;
                        /* Leave at once: key_iter may now refer to an
                         * erased element and must not be touched. */
                        break;
                    }
                    if (key_iter->e_token == this->dp_format->df_terminator) {
                        /* Everything before the terminator belongs to the
                         * previous value... */
                        value.splice(value.end(),
                                     key_comps,
                                     key_comps.begin(),
                                     key_iter);
                        /* ...and the terminator, now at the front, is
                         * thrown away. */
                        key_comps.pop_front();
                        strip(key_comps, element_if(DT_WHITE));
                        found = true;
                        break;
                    }
                    --key_iter;
                }
            }

            if (!found && !el_stack.empty() && !key_comps.empty()) {
                /* No appender/terminator: the last collected element is
                 * the key, the rest is the previous value. */
                element_list_t::iterator value_iter;

                value.splice(value.end(),
                             key_comps,
                             key_comps.begin(),
                             key_comps.end());
                value_iter = value.end();
                std::advance(value_iter, -1);
                key_comps.splice(key_comps.begin(),
                                 value,
                                 value_iter);
                key_comps.resize(1);
            }

            strip(value, element_if(DT_WHITE));
            value.remove_if(element_if(DT_COMMA));
            if (!value.empty()) {
                el_stack.push_back(element(value, DNT_VALUE));
            }
            strip(key_comps, element_if(DT_WHITE));
            if (!key_comps.empty()) {
                /* The key only records the capture range, it does not
                 * take the elements as children. */
                el_stack.push_back(element(key_comps, DNT_KEY, false));
            }
            key_comps.clear();
            value.clear();
        }
        else {
            key_comps.push_back(*iter);
        }
    }

    /* Whatever follows the last separator. */
    if (el_stack.empty()) {
        free_row.splice(free_row.begin(),
                        key_comps, key_comps.begin(), key_comps.end());
    }
    else {
        value.splice(value.begin(),
                     key_comps,
                     key_comps.begin(),
                     key_comps.end());
        strip(value, element_if(DT_WHITE));
        value.remove_if(element_if(DT_COMMA));
        if (!value.empty()) {
            el_stack.push_back(element(value, DNT_VALUE));
        }
    }

    SHA_Init(&context);
    while (!el_stack.empty()) {
        element_list_t::iterator kv_iter = el_stack.begin();

        if (kv_iter->e_token == DNT_VALUE) {
            /* A value without a key in front of it. */
            free_row.push_back(el_stack.front());
        }
        if (kv_iter->e_token != DNT_KEY) {
            el_stack.pop_front();
            continue;
        }

        /* A key must be followed directly by a value. */
        ++kv_iter;
        if (kv_iter == el_stack.end() || kv_iter->e_token != DNT_VALUE) {
            el_stack.pop_front();
            continue;
        }

        std::string key_val = this->get_element_string(el_stack.front());
        element_list_t pair_subs;

        if (schema != NULL) {
            SHA_Update(&context, key_val.c_str(), key_val.length());
        }

        /* Move the key and its value out of the stack into a new pair. */
        ++kv_iter;
        pair_subs.splice(pair_subs.begin(),
                         el_stack,
                         el_stack.begin(),
                         kv_iter);
        pairs_out.push_back(element(pair_subs, DNT_PAIR));
    }

    if (pairs_out.size() == 1) {
        element &pair = pairs_out.front();
        element &pair_value = pair.e_sub_elements->back();

        if (pair_value.e_token == DNT_VALUE &&
            pair_value.e_sub_elements != NULL &&
            pair_value.e_sub_elements->size() > 1) {
            /* Dissolve the single pair: keep its key as the prefix and
             * treat the value's children as a free row. */
            prefix.splice(prefix.begin(),
                          *pair.e_sub_elements,
                          pair.e_sub_elements->begin());
            free_row.clear();
            free_row.splice(free_row.begin(),
                            *pair_value.e_sub_elements,
                            pair_value.e_sub_elements->begin(),
                            pair_value.e_sub_elements->end());
            /* 'pair' and 'pair_value' dangle after this; they are not
             * used again. */
            pairs_out.clear();
            /* Start the hash over, dropping the dissolved pair's key. */
            SHA_Init(&context);
        }
    }

    if (pairs_out.empty() && !free_row.empty()) {
        while (!free_row.empty()) {
            switch (free_row.front().e_token) {
            case DNT_GROUP:
            case DT_NUMBER:
            case DT_SYMBOL:
            case DT_HEX_NUMBER:
            case DT_OCTAL_NUMBER:
            case DT_VERSION_NUMBER:
            case DT_QUOTED_STRING:
            case DT_IPV4_ADDRESS:
            case DT_IPV6_ADDRESS:
            case DT_MAC_ADDRESS:
            case DT_UUID:
            case DT_URL:
            case DT_PATH:
            case DT_TIME:
            case DT_PERCENTAGE: {
                /* Value-like element: pair it with a zero-length key
                 * positioned at the start of the value. */
                element_list_t pair_subs;
                struct element blank;

                blank.e_capture.c_begin = blank.e_capture.c_end = free_row.front().e_capture.c_begin;
                blank.e_token = DNT_KEY;
                pair_subs.push_back(blank);
                pair_subs.push_back(free_row.front());
                pairs_out.push_back(element(pair_subs, DNT_PAIR));
            }
            break;

            default: {
                /* Anything else only contributes its text to the hash
                 * (regardless of whether 'schema' is NULL). */
                std::string key_val = this->get_element_string(free_row.front());

                SHA_Update(&context, key_val.c_str(), key_val.length());
            }
            break;
            }

            free_row.pop_front();
        }
    }

    if (!prefix.empty()) {
        /* The key of the dissolved pair becomes the first value, again
         * with a zero-length key. */
        element_list_t pair_subs;
        struct element blank;

        blank.e_capture.c_begin = blank.e_capture.c_end = prefix.front().e_capture.c_begin;
        blank.e_token = DNT_KEY;
        pair_subs.push_back(blank);
        pair_subs.push_back(prefix.front());
        pairs_out.push_front(element(pair_subs, DNT_PAIR));
    }

    if (schema != NULL) {
        SHA_Final(this->dp_schema_id.ba_data, &context);
    }
}
void discover_format(void) {
pcre_context_static<30> pc;
int hist[DT_TERMINAL_MAX];
struct element elem;
this->dp_group_token.push_back(DT_INVALID);
this->dp_group_stack.resize(1);
data_format_state_t semi_state = DFS_INIT;
data_format_state_t comma_state = DFS_INIT;
memset(hist, 0, sizeof(hist));
while (this->dp_scanner->tokenize(pc, elem.e_token)) {
pcre_context::iterator pc_iter;
pc_iter = std::find_if(pc.begin(), pc.end(), capture_if_not(-1));
assert(pc_iter != pc.end());
elem.e_capture = *pc_iter;
assert(elem.e_capture.c_begin != -1);
assert(elem.e_capture.c_end != -1);
semi_state = dfs_semi_next(semi_state, elem.e_token);
comma_state = dfs_comma_next(comma_state, elem.e_token);
hist[elem.e_token] += 1;
switch (elem.e_token) {
case DT_LPAREN:
case DT_LANGLE:
case DT_LCURLY:
case DT_LSQUARE:
this->dp_group_token.push_back(elem.e_token);
this->dp_group_stack.push_back(element_list_t());
break;
case DT_RPAREN:
case DT_RANGLE:
case DT_RCURLY:
case DT_RSQUARE:
if (this->dp_group_token.back() == (elem.e_token - 1)) {
this->dp_group_token.pop_back();
std::list<element_list_t>::reverse_iterator riter = this->dp_group_stack.rbegin();
++riter;
if (!this->dp_group_stack.back().empty()) {
(*riter).push_back(element(this->dp_group_stack.back(), DNT_GROUP));
}
this->dp_group_stack.pop_back();
}
else {
this->dp_group_stack.back().push_back(elem);
}
break;
default:
this->dp_group_stack.back().push_back(elem);
break;
}
}
while (this->dp_group_stack.size() > 1) {
this->dp_group_token.pop_back();
std::list<element_list_t>::reverse_iterator riter = this->dp_group_stack.rbegin();
++riter;
if (!this->dp_group_stack.back().empty()) {
(*riter).push_back(element(this->dp_group_stack.back(), DNT_GROUP));
}
this->dp_group_stack.pop_back();
}
if (semi_state != DFS_ERROR && hist[DT_SEMI]) {
this->dp_format = &FORMAT_SEMI;
}
else if (comma_state != DFS_ERROR) {
this->dp_format = &FORMAT_COMMA;
}
else {
this->dp_format = &FORMAT_PLAIN;
}
};
/**
 * Parse the scanner's input: discover_format() tokenizes it and selects
 * dp_format, then pairup() turns the top-level element list into the
 * pairs stored in dp_pairs and computes dp_schema_id.
 *
 * The original ended with a loop that extracted the key string of every
 * pair and immediately discarded it; that dead work has been removed.
 */
void parse(void)
{
    this->discover_format();

    this->pairup(&this->dp_schema_id,
                 this->dp_pairs,
                 this->dp_group_stack.front());
}
/**
 * @param elem The element whose text is wanted.
 * @return The part of the scanner's input covered by the element's
 * capture.
 */
std::string get_element_string(element &elem)
{
    return this->dp_scanner->get_input().get_substr(&elem.e_capture);
}
void print(FILE *out, element_list_t &el) {
fprintf(out, " %s\n",
this->dp_scanner->get_input().get_string());
for (element_list_t::iterator iter = el.begin();
iter != el.end();
++iter) {
iter->print(out, this->dp_scanner->get_input());
}
};
/* Stack of the opening-bracket tokens of the groups currently open while
 * discover_format() runs; DT_INVALID is pushed first as the bottom entry. */
std::vector<data_token_t> dp_group_token;
/* One element list per open group; front() is the top-level list that
 * parse() hands to pairup(). */
std::list<element_list_t> dp_group_stack;
/* Not referenced by any code visible in this header. */
element_list_t dp_errors;
/* The pairs produced by parse(). */
element_list_t dp_pairs;
/* Digest written by SHA_Final() at the end of pairup() when it is called
 * with a non-NULL schema. */
schema_id_t dp_schema_id;
/* Format chosen by discover_format(); NULL until then. */
data_format *dp_format;
private:
/* Source of tokens and input text. The class declares no destructor, so
 * the scanner is not freed here. */
data_scanner *dp_scanner;
};
#endif