/**
 * Copyright (c) 2022, Timothy Stack
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * * Neither the name of Timothy Stack nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @file pcrepp.cc
 */

#include "pcre2pp.hh"

#include "config.h"

namespace lnav {
namespace pcre2pp {

/**
 * Backslash-escape a C string so it can be embedded in a pattern.
 *
 * Bytes accepted by isalnum(), underscores, and bytes with the high bit set
 * are copied through unchanged; every other byte is emitted with a leading
 * backslash.
 *
 * @param unquoted NUL-terminated text to escape.
 * @return The escaped copy of the input.
 */
std::string
quote(const char* unquoted)
{
    std::string retval;

    for (int lpc = 0; unquoted[lpc]; lpc++) {
        const char ch = unquoted[lpc];
        // The <ctype> functions require a value representable as
        // unsigned char; passing a negative plain char is undefined, so
        // convert before classifying.
        const auto uch = static_cast<unsigned char>(ch);

        if (isalnum(uch) || ch == '_' || (uch & 0x80)) {
            retval.push_back(ch);
        } else {
            retval.push_back('\\');
            retval.push_back(ch);
        }
    }

    return retval;
}

/**
 * Allocate a match-data block sized from this compiled pattern.
 *
 * @return A match_data wrapping the block; the block is released with
 * pcre2_match_data_free through the auto_mem holder.
 */
match_data
code::create_match_data() const
{
    auto_mem<pcre2_match_data> md(pcre2_match_data_free);

    md = pcre2_match_data_create_from_pattern(this->p_code, nullptr);

    return match_data{std::move(md)};
}

/**
 * Compile a pattern into a code object.
 *
 * PCRE2_UTF is always OR-ed into the supplied options.  A JIT compile is
 * attempted afterwards; a JIT failure is ignored and the compiled pattern is
 * still returned.
 *
 * @param sf The pattern text.
 * @param options PCRE2 compile option bits.
 * @return Ok with the compiled code, or Err with a compile_error carrying
 * the error code, the error offset, and a copy of the pattern.
 */
Result<code, compile_error>
code::from(string_fragment sf, int options)
{
    compile_error ce;
    auto_mem<pcre2_code> co(pcre2_code_free);

    options |= PCRE2_UTF;
    co = pcre2_compile(
        sf.udata(), sf.length(), options, &ce.ce_code, &ce.ce_offset, nullptr);

    if (co == nullptr) {
        ce.ce_pattern = sf.to_string();
        return Err(ce);
    }

    auto jit_rc = pcre2_jit_compile(co, PCRE2_JIT_COMPLETE);
    if (jit_rc < 0) {
        // log_error("failed to JIT compile pattern: %d", jit_rc);
    }

    return Ok(code{std::move(co), sf.to_string()});
}

/**
 * Fetch the name table description for this pattern.
 *
 * @return A named_captures filled with the name count, the entry size, and
 * the name table pointer as reported by pcre2_pattern_info.  The return
 * codes of the three queries are not checked.
 */
code::named_captures
code::get_named_captures() const
{
    named_captures retval;

    pcre2_pattern_info(
        this->p_code.in(), PCRE2_INFO_NAMECOUNT, &retval.nc_count);
    pcre2_pattern_info(
        this->p_code.in(), PCRE2_INFO_NAMEENTRYSIZE, &retval.nc_entry_size);
    pcre2_pattern_info(
        this->p_code.in(), PCRE2_INFO_NAMETABLE, &retval.nc_name_table);

    return retval;
}

/**
 * Look for a partial match against progressively shorter prefixes of the
 * input.
 *
 * A PCRE2_PARTIAL_HARD match is attempted on the first `length` bytes of the
 * input, starting with the full length and dropping one trailing byte after
 * each attempt that does not report PCRE2_ERROR_PARTIAL.  At least one
 * attempt is always made, even for an empty input.
 *
 * @param in The text to test.
 * @return md_ovector[1] from the first attempt that reports a partial match,
 * or 0 if no attempt does.
 */
size_t
code::match_partial(string_fragment in) const
{
    auto md = this->create_match_data();
    auto length = in.length();

    do {
        auto rc = pcre2_match(this->p_code.in(),
                              in.udata(),
                              length,
                              0,
                              PCRE2_PARTIAL_HARD,
                              md.md_data.in(),
                              nullptr);

        if (rc == PCRE2_ERROR_PARTIAL) {
            return md.md_ovector[1];
        }

        if (length > 0) {
            length -= 1;
        }
    } while (length > 0);

    return 0;
}

/**
 * Find the name attached to a capture group number.
 *
 * @param index The capture group number to look up.
 * @return A pointer to the name bytes inside the pattern's name table, or
 * nullptr if no named capture has that number.
 */
const char*
code::get_name_for_capture(size_t index) const
{
    for (const auto cap : this->get_named_captures()) {
        if (cap.get_index() == index) {
            return cap.get_name().data();
        }
    }

    return nullptr;
}

/**
 * @return The PCRE2_INFO_CAPTURECOUNT value for this pattern, or 0 if the
 * query does not write a value.
 */
size_t
code::get_capture_count() const
{
    // Initialized so a failed query cannot return an indeterminate value.
    uint32_t retval = 0;

    pcre2_pattern_info(this->p_code.in(), PCRE2_INFO_CAPTURECOUNT, &retval);

    return retval;
}

/**
 * Locate the source text of every capturing group in the pattern.
 *
 * The pattern string is scanned once while tracking backslash escapes,
 * character classes ("[...]"), and literal sections opened by "\Q" and closed
 * by "\E".  Outside of those, each "(" opens a candidate group and each ")"
 * closes the most recent one.  A closed group is recorded when the character
 * after its "(" is neither '?' nor '*', or when the group starts with "(?'",
 * "(?<" followed by a letter or underscore, or "(?P<".  A group that is the
 * condition of "(?(DEFINE)" is dropped without being recorded.
 *
 * Groups are appended in the order their closing parenthesis is seen.
 *
 * @return One fragment of the pattern per capturing group; the count is
 * asserted to equal get_capture_count().
 */
std::vector<string_fragment>
code::get_captures() const
{
    bool in_class = false, in_escape = false, in_literal = false;
    auto pat_frag = string_fragment::from_str(this->p_pattern);
    std::vector<string_fragment> cap_in_progress;
    std::vector<string_fragment> retval;

    for (int lpc = 0; this->p_pattern[lpc]; lpc++) {
        if (in_escape) {
            in_escape = false;
            if (this->p_pattern[lpc] == 'Q') {
                in_literal = true;
            }
        } else if (in_class) {
            if (this->p_pattern[lpc] == ']') {
                in_class = false;
            }
            if (this->p_pattern[lpc] == '\\') {
                in_escape = true;
            }
        } else if (in_literal) {
            if (this->p_pattern[lpc] == '\\'
                && this->p_pattern[lpc + 1] == 'E')
            {
                in_literal = false;
                // Step over the 'E' as well.
                lpc += 1;
            }
        } else {
            switch (this->p_pattern[lpc]) {
                case '\\':
                    in_escape = true;
                    break;
                case '[':
                    in_class = true;
                    break;
                case '(':
                    // Empty fragment at the '('; its end is filled in when
                    // the matching ')' is reached.
                    cap_in_progress.emplace_back(
                        pat_frag.sub_range(lpc, lpc));
                    break;
                case ')': {
                    if (!cap_in_progress.empty()) {
                        static const auto DEFINE_SF
                            = string_fragment::from_const("(?(DEFINE)");

                        auto& cap = cap_in_progress.back();
                        char first = '\0', second = '\0', third = '\0';
                        bool is_cap = false;

                        cap.sf_end = lpc + 1;
                        if (cap.length() >= 2) {
                            first = this->p_pattern[cap.sf_begin + 1];
                        }
                        if (cap.length() >= 3) {
                            second = this->p_pattern[cap.sf_begin + 2];
                        }
                        if (cap.length() >= 4) {
                            third = this->p_pattern[cap.sf_begin + 3];
                        }
                        if (cap.sf_begin >= 2) {
                            // Include the two characters before the group
                            // to see whether this is the "(DEFINE)" condition.
                            auto poss_define
                                = string_fragment::from_str_range(
                                    this->p_pattern,
                                    cap.sf_begin - 2,
                                    cap.sf_end);
                            if (poss_define == DEFINE_SF) {
                                cap_in_progress.pop_back();
                                // Resumes the enclosing for loop.
                                continue;
                            }
                        }
                        if (first == '?') {
                            if (second == '\'') {
                                is_cap = true;
                            }
                            // Cast before isalpha(): a negative plain char
                            // is undefined for the <ctype> functions.
                            if (second == '<'
                                && (isalpha(static_cast<unsigned char>(third))
                                    || third == '_'))
                            {
                                is_cap = true;
                            }
                            if (second == 'P' && third == '<') {
                                is_cap = true;
                            }
                        } else if (first != '*') {
                            is_cap = true;
                        }
                        if (is_cap) {
                            retval.emplace_back(cap);
                        }
                        cap_in_progress.pop_back();
                    }
                    break;
                }
            }
        }
    }

    assert((size_t) this->get_capture_count() == retval.size());

    return retval;
}

std::string code::replace(string_fragment str, const char* repl) const { std::string retval; std::string::size_type start = 0; string_fragment remaining = str; auto md = this->create_match_data(); while (remaining.is_valid()) { auto find_res = this->capture_from(str) .at(remaining) .into(md) .matches() .ignore_error(); if (!find_res) { break; } auto all = find_res->f_all; remaining = find_res->f_remaining; bool in_escape = false; retval.append(str.data(), start, (all.sf_begin - start)); start = all.sf_end; for (int lpc = 0; repl[lpc]; lpc++) { auto ch = repl[lpc]; if (in_escape) { if (isdigit(ch)) { auto capture_index = (ch - '0'); if (capture_index < md.get_count()) { auto cap = md[capture_index]; if (cap) { retval.append(cap->data(), cap->length()); } } else if (capture_index > this->get_capture_count()) { retval.push_back('\\'); retval.push_back(ch); } } else { if (ch != '\\') { retval.push_back('\\'); } retval.push_back(ch); } in_escape = false; } else { switch (ch) { case '\\': in_escape = true; break; default: retval.push_back(ch); break; } } } } if (remaining.is_valid()) { retval.append(str.data(), remaining.sf_begin, std::string::npos); } return retval; } int code::name_index(const char* name) const { return pcre2_substring_number_from_name(this->p_code.in(), (PCRE2_SPTR) name); } size_t code::named_capture::get_index() const { return (this->nc_entry[0] << 8) | (this->nc_entry[1] & 0xff); } string_fragment code::named_capture::get_name() const { return string_fragment::from_bytes( &this->nc_entry[2], strlen((const char*) &this->nc_entry[2])); } code::named_capture code::named_captures::iterator::operator*() const { return code::named_capture{this->i_entry}; } code::named_captures::iterator& code::named_captures::iterator::operator++() { this->i_entry += this->i_entry_size; return *this; } bool code::named_captures::iterator::operator==(const iterator& other) const { return this->i_entry == other.i_entry && this->i_entry_size == other.i_entry_size; } bool 
code::named_captures::iterator::operator!=(const iterator& other) const { return this->i_entry != other.i_entry || this->i_entry_size != other.i_entry_size; } code::named_captures::iterator code::named_captures::begin() const { return iterator{this->nc_entry_size, this->nc_name_table}; } code::named_captures::iterator code::named_captures::end() const { return iterator{ this->nc_entry_size, this->nc_name_table + (this->nc_count * this->nc_entry_size), }; } matcher::matches_result matcher::matches(uint32_t options) { this->mb_input.i_offset = this->mb_input.i_next_offset; if (this->mb_input.i_offset == -1) { return not_found{}; } auto rc = pcre2_match(this->mb_code.p_code.in(), this->mb_input.i_string.udata(), this->mb_input.i_string.length(), this->mb_input.i_offset, options, this->mb_match_data.md_data.in(), nullptr); if (rc > 0) { this->mb_match_data.md_input = this->mb_input; this->mb_match_data.md_code = &this->mb_code; this->mb_match_data.md_capture_end = rc; if (this->mb_match_data[0]->empty() && this->mb_match_data[0]->sf_end >= this->mb_input.i_string.sf_end) { this->mb_input.i_next_offset = -1; } else if (this->mb_match_data[0]->empty()) { this->mb_input.i_next_offset = this->mb_match_data[0]->sf_end + 1; } else { this->mb_input.i_next_offset = this->mb_match_data[0]->sf_end; } this->mb_match_data.md_input.i_next_offset = this->mb_input.i_next_offset; return found{ this->mb_match_data[0].value(), this->mb_match_data.remaining(), }; } this->mb_match_data.md_input = this->mb_input; this->mb_match_data.md_ovector[0] = this->mb_input.i_offset; this->mb_match_data.md_ovector[1] = this->mb_input.i_offset; this->mb_match_data.md_capture_end = 1; if (rc == PCRE2_ERROR_NOMATCH) { return not_found{}; } return error{&this->mb_code, rc}; } void matcher::matches_result::handle_error(matcher::error err) { unsigned char buffer[1024]; pcre2_get_error_message(err.e_error_code, buffer, sizeof(buffer)); // log_error("pcre2_match failure: %s", buffer); } std::string 
compile_error::get_message() const { unsigned char buffer[1024]; pcre2_get_error_message(this->ce_code, buffer, sizeof(buffer)); return {(const char*) buffer}; } std::string matcher::error::get_message() { unsigned char buffer[1024]; pcre2_get_error_message(this->e_error_code, buffer, sizeof(buffer)); return {(const char*) buffer}; } } // namespace pcre2pp } // namespace lnav