You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
lnav/src/pcrepp/pcre2pp.cc

459 lines
13 KiB
C++

/**
* Copyright (c) 2022, Timothy Stack
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of Timothy Stack nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @file pcre2pp.cc
*/
#include "pcre2pp.hh"
#include "config.h"
namespace lnav {
namespace pcre2pp {
/**
 * Escape a string so that it can be embedded in a pattern and matched
 * literally.
 *
 * Bytes that are alphanumeric (per isalnum()), underscores, and bytes with
 * the high bit set are copied through unchanged.  Every other byte is
 * preceded by a backslash.
 *
 * @param unquoted NUL-terminated string to escape; must not be null.
 * @return The escaped copy of the input.
 */
std::string
quote(const char* unquoted)
{
    std::string retval;

    for (const char* cursor = unquoted; *cursor != '\0'; ++cursor) {
        // The <cctype> classifiers have undefined behavior when handed a
        // negative value, which is what a high-bit byte becomes when plain
        // char is signed.  Inspect the byte as an unsigned char and test
        // the high bit before asking isalnum() about it.
        const auto byte = static_cast<unsigned char>(*cursor);
        const bool passthrough
            = (byte & 0x80) != 0 || byte == '_' || isalnum(byte) != 0;

        if (!passthrough) {
            retval.push_back('\\');
        }
        retval.push_back(*cursor);
    }

    return retval;
}
/**
 * Create a PCRE2 match-data block from this compiled pattern.
 *
 * @return A match_data wrapper constructed from the newly created block.
 */
match_data
code::create_match_data() const
{
    // The holder is constructed with pcre2_match_data_free (presumably the
    // function it uses to release whatever it ends up owning).
    auto_mem<pcre2_match_data> block(pcre2_match_data_free);

    block = pcre2_match_data_create_from_pattern(this->p_code, nullptr);

    return match_data{std::move(block)};
}
/**
 * Compile a regular expression into a code object.
 *
 * PCRE2_UTF is always OR'd into the caller's options before compiling.
 *
 * @param sf The pattern text.
 * @param options PCRE2 compile options to use in addition to PCRE2_UTF.
 * @return Ok with the compiled code (constructed together with a copy of
 *   the pattern text), or Err with a compile_error carrying the PCRE2
 *   error code, the error offset and the pattern when compilation fails.
 */
Result<code, compile_error>
code::from(string_fragment sf, int options)
{
    compile_error failure;
    auto_mem<pcre2_code> compiled(pcre2_code_free);

    compiled = pcre2_compile(sf.udata(),
                             sf.length(),
                             options | PCRE2_UTF,
                             &failure.ce_code,
                             &failure.ce_offset,
                             nullptr);
    if (compiled == nullptr) {
        failure.ce_pattern = sf.to_string();
        return Err(failure);
    }

    // The JIT result is not acted upon: whether or not JIT compilation
    // succeeded, the compiled pattern is returned to the caller.
    const auto jit_status = pcre2_jit_compile(compiled, PCRE2_JIT_COMPLETE);
    if (jit_status < 0) {
        // log_error("failed to JIT compile pattern: %d", jit_status);
    }

    return Ok(code{std::move(compiled), sf.to_string()});
}
/**
 * Collect the named-capture table information for this pattern.
 *
 * @return A named_captures whose count, entry size and name-table pointer
 *   are requested from PCRE2 via PCRE2_INFO_NAMECOUNT,
 *   PCRE2_INFO_NAMEENTRYSIZE and PCRE2_INFO_NAMETABLE respectively.
 */
code::named_captures
code::get_named_captures() const
{
    auto* const compiled = this->p_code.in();
    named_captures table;

    // Each query fills in one field; the return codes are not inspected.
    pcre2_pattern_info(compiled, PCRE2_INFO_NAMECOUNT, &table.nc_count);
    pcre2_pattern_info(
        compiled, PCRE2_INFO_NAMEENTRYSIZE, &table.nc_entry_size);
    pcre2_pattern_info(compiled, PCRE2_INFO_NAMETABLE, &table.nc_name_table);

    return table;
}
/**
 * Find how much of the input can be consumed as a partial match.
 *
 * The pattern is matched with PCRE2_PARTIAL_HARD against the input and then
 * against prefixes that are one shorter on each attempt, until PCRE2 reports
 * a partial match or the prefix length reaches zero.
 *
 * @param in The text to test.
 * @return The second ovector entry recorded for the partial match, or zero
 *   if no attempt produced one.
 */
size_t
code::match_partial(string_fragment in) const
{
    auto md = this->create_match_data();
    auto prefix_len = in.length();

    while (true) {
        const auto rc = pcre2_match(this->p_code.in(),
                                    in.udata(),
                                    prefix_len,
                                    0,
                                    PCRE2_PARTIAL_HARD,
                                    md.md_data.in(),
                                    nullptr);

        if (rc == PCRE2_ERROR_PARTIAL) {
            return md.md_ovector[1];
        }

        // Shrink the prefix, never stepping below zero, and stop once there
        // is nothing left to try.  The first attempt always runs, even for
        // an empty input.
        if (prefix_len > 0) {
            prefix_len -= 1;
        }
        if (!(prefix_len > 0)) {
            break;
        }
    }

    return 0;
}
/**
 * Look up the name of a capture group by its number.
 *
 * @param index The capture number to search for.
 * @return The data pointer of the matching entry's name fragment, or
 *   nullptr when no named capture has that number.
 */
const char*
code::get_name_for_capture(size_t index) const
{
    const auto table = this->get_named_captures();

    for (const auto entry : table) {
        if (entry.get_index() != index) {
            continue;
        }
        return entry.get_name().data();
    }

    return nullptr;
}
/**
 * Query the number of capturing groups in the compiled pattern.
 *
 * @return The value PCRE2 stores for PCRE2_INFO_CAPTURECOUNT, or zero if the
 *   query does not write a result.
 */
size_t
code::get_capture_count() const
{
    // Start from zero so that a failed query (its return code is not
    // checked) cannot hand an indeterminate value back to the caller.
    uint32_t retval = 0;

    pcre2_pattern_info(this->p_code.in(), PCRE2_INFO_CAPTURECOUNT, &retval);
    return retval;
}
/**
 * Locate the source text of every capturing group in the pattern.
 *
 * The pattern string is scanned one character at a time while tracking
 * whether the scanner is just after a backslash, inside a [...] character
 * class, or inside a \Q...\E literal section, so that parentheses in those
 * contexts are not treated as group delimiters.
 *
 * @return One fragment per capturing group, running from the group's opening
 *   parenthesis through its closing one.  The number of fragments is
 *   asserted to equal get_capture_count().
 *
 * NOTE(review): fragments are appended when a group closes, so for nested
 * groups the inner one precedes the outer one -- confirm callers do not
 * expect capture-number order.
 */
std::vector<string_fragment>
code::get_captures() const
{
bool in_class = false, in_escape = false, in_literal = false;
auto pat_frag = string_fragment::from_str(this->p_pattern);
// Stack of groups that have been opened but not yet closed.
std::vector<string_fragment> cap_in_progress;
std::vector<string_fragment> retval;
for (int lpc = 0; this->p_pattern[lpc]; lpc++) {
if (in_escape) {
// The character after a backslash is consumed here; "\Q" additionally
// switches to literal mode.
in_escape = false;
if (this->p_pattern[lpc] == 'Q') {
in_literal = true;
}
} else if (in_class) {
// Inside a character class only ']' (ends the class) and '\' (escapes the
// next character) are acted upon.
if (this->p_pattern[lpc] == ']') {
in_class = false;
}
if (this->p_pattern[lpc] == '\\') {
in_escape = true;
}
} else if (in_literal) {
// Inside \Q...\E everything is skipped until the closing "\E".
if (this->p_pattern[lpc] == '\\' && this->p_pattern[lpc + 1] == 'E')
{
in_literal = false;
lpc += 1;
}
} else {
switch (this->p_pattern[lpc]) {
case '\\':
in_escape = true;
break;
case '[':
in_class = true;
break;
case '(':
// Remember where this group opens; whether it is a capture is decided
// when the matching ')' is seen.
cap_in_progress.emplace_back(pat_frag.sub_range(lpc, lpc));
break;
case ')': {
if (!cap_in_progress.empty()) {
static const auto DEFINE_SF
= string_fragment::from_const("(?(DEFINE)");
auto& cap = cap_in_progress.back();
char first = '\0', second = '\0', third = '\0';
bool is_cap = false;
// Extend the fragment so it includes this closing parenthesis.
cap.sf_end = lpc + 1;
// Peek at up to three characters following the '(' to classify the group.
if (cap.length() >= 2) {
first = this->p_pattern[cap.sf_begin + 1];
}
if (cap.length() >= 3) {
second = this->p_pattern[cap.sf_begin + 2];
}
if (cap.length() >= 4) {
third = this->p_pattern[cap.sf_begin + 3];
}
// A "(DEFINE)" group immediately preceded by "(?" is the condition of a
// "(?(DEFINE)...)" construct, not a capture: drop it and move on to the
// next character.
if (cap.sf_begin >= 2) {
auto poss_define = string_fragment::from_str_range(
this->p_pattern, cap.sf_begin - 2, cap.sf_end);
if (poss_define == DEFINE_SF) {
cap_in_progress.pop_back();
continue;
}
}
if (first == '?') {
// Of the "(?" forms, only the named-group spellings count as captures:
// "(?'", "(?<" followed by a letter or underscore, and "(?P<".
if (second == '\'') {
is_cap = true;
}
if (second == '<'
&& (isalpha(third) || third == '_'))
{
is_cap = true;
}
if (second == 'P' && third == '<') {
is_cap = true;
}
} else if (first != '*') {
// A plain "(...)" captures; a group starting with "(*" does not.
is_cap = true;
}
if (is_cap) {
retval.emplace_back(cap);
}
cap_in_progress.pop_back();
}
break;
}
}
}
}
// Sanity check: the textual scan must agree with the count PCRE2 reports.
assert((size_t) this->get_capture_count() == retval.size());
return retval;
}
/**
 * Replace every match of this pattern in the input using a replacement
 * template.
 *
 * In the template, a backslash followed by a single digit N refers to
 * capture N:
 *  - if N is below md.get_count() for the current match and the capture is
 *    set, the capture's text is appended;
 *  - otherwise, if N is greater than get_capture_count(), the sequence is
 *    copied through literally as backslash + digit;
 *  - otherwise nothing is appended.
 * A doubled backslash produces a single backslash, and a backslash followed
 * by any other character is copied through with the backslash kept.  A lone
 * backslash at the very end of the template is dropped.
 *
 * @param str The subject text to search.
 * @param repl NUL-terminated replacement template; must not be null.
 * @return The subject with each match replaced by the expanded template.
 */
std::string
code::replace(string_fragment str, const char* repl) const
{
    std::string retval;
    std::string::size_type start = 0;
    string_fragment remaining = str;
    const char* const subject = str.data();
    auto md = this->create_match_data();

    while (remaining.is_valid()) {
        auto find_res = this->capture_from(str)
                            .at(remaining)
                            .into(md)
                            .matches()
                            .ignore_error();
        if (!find_res) {
            break;
        }

        auto all = find_res->f_all;
        remaining = find_res->f_remaining;
        bool in_escape = false;

        // Copy the unmatched text between the previous match and this one
        // directly from the subject buffer.  Passing a pointer and a length
        // avoids building a temporary std::string of the whole subject for
        // every match, which the (const char*, pos, count) spelling of
        // append() does implicitly.
        const auto match_begin
            = static_cast<std::string::size_type>(all.sf_begin);
        if (match_begin > start) {
            retval.append(subject + start, match_begin - start);
        }
        start = all.sf_end;

        for (int lpc = 0; repl[lpc]; lpc++) {
            auto ch = repl[lpc];

            if (in_escape) {
                // isdigit() requires a value representable as unsigned char.
                if (isdigit(static_cast<unsigned char>(ch))) {
                    auto capture_index = (ch - '0');

                    if (capture_index < md.get_count()) {
                        auto cap = md[capture_index];

                        if (cap) {
                            retval.append(cap->data(), cap->length());
                        }
                    } else if (capture_index > this->get_capture_count()) {
                        // Not a capture this pattern could ever have, so keep
                        // the escape sequence as written.
                        retval.push_back('\\');
                        retval.push_back(ch);
                    }
                } else {
                    if (ch != '\\') {
                        retval.push_back('\\');
                    }
                    retval.push_back(ch);
                }
                in_escape = false;
            } else {
                switch (ch) {
                    case '\\':
                        in_escape = true;
                        break;
                    default:
                        retval.push_back(ch);
                        break;
                }
            }
        }
    }

    if (remaining.is_valid()) {
        // Append whatever follows the last match.
        // NOTE(review): this reads up to the NUL terminator of the backing
        // buffer rather than stopping at str's end -- confirm callers always
        // pass a fragment that ends at the terminator.
        retval.append(subject + remaining.sf_begin);
    }

    return retval;
}
int
code::name_index(const char* name) const
{
return pcre2_substring_number_from_name(this->p_code.in(),
(PCRE2_SPTR) name);
}
/**
 * Decode the capture number stored at the start of this name-table entry.
 *
 * @return The first two bytes of the entry combined as a 16-bit value, with
 *   the first byte as the high-order byte.
 */
size_t
code::named_capture::get_index() const
{
    const auto high = this->nc_entry[0] << 8;
    const auto low = this->nc_entry[1] & 0xff;

    return high | low;
}
/**
 * Get the group name stored in this name-table entry.
 *
 * @return A fragment that starts at the third byte of the entry and whose
 *   length is the strlen() of the text found there.
 */
string_fragment
code::named_capture::get_name() const
{
    // The name text begins after the two leading bytes of the entry.
    auto* const name_start = &this->nc_entry[2];
    const auto name_len = strlen(reinterpret_cast<const char*>(name_start));

    return string_fragment::from_bytes(name_start, name_len);
}
/**
 * Dereference the iterator.
 *
 * @return A named_capture built from the entry pointer the iterator is
 *   currently holding.
 */
code::named_capture
code::named_captures::iterator::operator*() const
{
    named_capture current{this->i_entry};

    return current;
}
/**
 * Advance to the next name-table entry by moving the entry pointer forward
 * by one entry size.
 *
 * @return This iterator.
 */
code::named_captures::iterator&
code::named_captures::iterator::operator++()
{
    this->i_entry = this->i_entry + this->i_entry_size;

    return *this;
}
/**
 * Compare two iterators.
 *
 * @return True when both the entry pointer and the entry size are equal.
 */
bool
code::named_captures::iterator::operator==(const iterator& other) const
{
    if (this->i_entry != other.i_entry) {
        return false;
    }

    return this->i_entry_size == other.i_entry_size;
}
/**
 * Compare two iterators for inequality.
 *
 * @return True when the entry pointers or the entry sizes differ, i.e. the
 *   negation of operator==.
 */
bool
code::named_captures::iterator::operator!=(const iterator& other) const
{
    return !(*this == other);
}
/**
 * @return An iterator built from the entry size and the start of the name
 *   table.
 */
code::named_captures::iterator
code::named_captures::begin() const
{
    iterator first{this->nc_entry_size, this->nc_name_table};

    return first;
}
/**
 * @return An iterator built from the entry size and the address that is
 *   nc_count entries past the start of the name table.
 */
code::named_captures::iterator
code::named_captures::end() const
{
    const auto table_bytes = this->nc_count * this->nc_entry_size;

    return iterator{this->nc_entry_size, this->nc_name_table + table_bytes};
}
/**
 * Run the next match attempt against the input and advance the search
 * position.
 *
 * @param options Option bits passed straight through to pcre2_match().
 * @return found with the whole-match fragment and the remaining input when
 *   pcre2_match() returns a positive value; not_found when the search was
 *   already exhausted or PCRE2 reports PCRE2_ERROR_NOMATCH; otherwise an
 *   error carrying the code object and the pcre2_match() return value.
 */
matcher::matches_result
matcher::matches(uint32_t options)
{
// Resume from where the previous call left off; -1 marks an exhausted
// search.
this->mb_input.i_offset = this->mb_input.i_next_offset;
if (this->mb_input.i_offset == -1) {
return not_found{};
}
auto rc = pcre2_match(this->mb_code.p_code.in(),
this->mb_input.i_string.udata(),
this->mb_input.i_string.length(),
this->mb_input.i_offset,
options,
this->mb_match_data.md_data.in(),
nullptr);
if (rc > 0) {
// Publish the input, code and capture count into the match data before
// mb_match_data[0] is consulted below.
this->mb_match_data.md_input = this->mb_input;
this->mb_match_data.md_code = &this->mb_code;
this->mb_match_data.md_capture_end = rc;
// Decide where the next call should start searching:
//  - an empty match at or past the end of the input ends the search;
//  - any other empty match steps one past its end;
//  - a non-empty match continues from its end.
// NOTE(review): the one-byte step after an empty match may land inside a
// multi-byte character when the pattern was compiled with PCRE2_UTF --
// confirm this is acceptable.
if (this->mb_match_data[0]->empty()
&& this->mb_match_data[0]->sf_end >= this->mb_input.i_string.sf_end)
{
this->mb_input.i_next_offset = -1;
} else if (this->mb_match_data[0]->empty()) {
this->mb_input.i_next_offset = this->mb_match_data[0]->sf_end + 1;
} else {
this->mb_input.i_next_offset = this->mb_match_data[0]->sf_end;
}
// md_input was copied before the next offset was computed, so bring its
// copy up to date.
this->mb_match_data.md_input.i_next_offset
= this->mb_input.i_next_offset;
return found{
this->mb_match_data[0].value(),
this->mb_match_data.remaining(),
};
}
// No match (or a failure): record an empty range at the search offset with
// a single capture slot in the match data.
this->mb_match_data.md_input = this->mb_input;
this->mb_match_data.md_ovector[0] = this->mb_input.i_offset;
this->mb_match_data.md_ovector[1] = this->mb_input.i_offset;
this->mb_match_data.md_capture_end = 1;
if (rc == PCRE2_ERROR_NOMATCH) {
return not_found{};
}
// Any other non-positive return value is surfaced as an error.
return error{&this->mb_code, rc};
}
/**
 * Handle a match error by asking PCRE2 for the message text of its code.
 *
 * The text is only fetched into a local buffer; the logging call that would
 * report it is commented out.
 *
 * @param err The error whose code is looked up.
 */
void
matcher::matches_result::handle_error(matcher::error err)
{
    unsigned char message[1024];

    pcre2_get_error_message(err.e_error_code, message, sizeof(message));
    // log_error("pcre2_match failure: %s", message);
}
std::string
compile_error::get_message() const
{
unsigned char buffer[1024];
pcre2_get_error_message(this->ce_code, buffer, sizeof(buffer));
return {(const char*) buffer};
}
/**
 * Get the human-readable text for this match error.
 *
 * @return The message PCRE2 provides for e_error_code, or a generic message
 *   containing the numeric code if PCRE2 supplies no text for it.
 */
std::string
matcher::error::get_message()
{
    // Zero-fill the buffer so it always holds a valid C string: the buffer
    // is not guaranteed to be written when the lookup fails.
    unsigned char buffer[1024] = {0};
    const auto rc
        = pcre2_get_error_message(this->e_error_code, buffer, sizeof(buffer));

    if (rc < 0 && buffer[0] == '\0') {
        // The lookup failed and produced no text at all.
        return "unknown PCRE2 error code "
            + std::to_string(this->e_error_code);
    }

    return {reinterpret_cast<const char*>(buffer)};
}
} // namespace pcre2pp
} // namespace lnav