mirror of
https://git.openwrt.org/feed/packages.git
synced 2024-06-18 13:23:57 +02:00
3aa746b246
In converting nginx-util to PCRE2, it was wrongly dropped saving the
results of the regex match causing segmentation fault when used.
Add the missing code to correctly store the vector of the results from
the regex.
Fixes: b738e42c4d
("nginx-util: move to pcre2")
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
510 lines
16 KiB
C++
510 lines
16 KiB
C++
#ifndef __REGEXP_PCRE_HPP
|
|
#define __REGEXP_PCRE_HPP
|
|
|
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
|
|
|
#include <pcre2.h>
|
|
#include <array>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
namespace rgx {
|
|
/* partially implement the std::regex interface using PCRE for performance
|
|
* (=> pass "match" as non-const reference)
|
|
*/
|
|
|
|
namespace regex_constants {
|
|
enum error_type {
|
|
_enum_error_collate,
|
|
_enum_error_ctype,
|
|
_enum_error_escape,
|
|
_enum_error_backref,
|
|
_enum_error_brack,
|
|
_enum_error_paren,
|
|
_enum_error_brace,
|
|
_enum_error_badbrace,
|
|
_enum_error_range,
|
|
_enum_error_space,
|
|
_enum_error_badrepeat,
|
|
_enum_error_complexity,
|
|
_enum_error_stack,
|
|
_enum_error_last
|
|
};
|
|
static const error_type error_collate(_enum_error_collate);
|
|
static const error_type error_ctype(_enum_error_ctype);
|
|
static const error_type error_escape(_enum_error_escape);
|
|
static const error_type error_backref(_enum_error_backref);
|
|
static const error_type error_brack(_enum_error_brack);
|
|
static const error_type error_paren(_enum_error_paren);
|
|
static const error_type error_brace(_enum_error_brace);
|
|
static const error_type error_badbrace(_enum_error_badbrace);
|
|
static const error_type error_range(_enum_error_range);
|
|
static const error_type error_space(_enum_error_space);
|
|
static const error_type error_badrepeat(_enum_error_badrepeat);
|
|
static const error_type error_complexity(_enum_error_complexity);
|
|
static const error_type error_stack(_enum_error_stack);
|
|
} // namespace regex_constants
|
|
|
|
class regex_error : public std::runtime_error {
|
|
private:
|
|
regex_constants::error_type errcode;
|
|
|
|
public:
|
|
explicit regex_error(regex_constants::error_type code, const char* what = "regex error")
|
|
: runtime_error(what), errcode(code)
|
|
{}
|
|
|
|
[[nodiscard]] auto virtual code() const -> regex_constants::error_type;
|
|
};
|
|
|
|
[[nodiscard]] auto regex_error::code() const -> regex_constants::error_type
|
|
{
|
|
return errcode;
|
|
}
|
|
|
|
class regex {
|
|
private:
|
|
int errcode = 0;
|
|
|
|
PCRE2_SIZE erroffset = 0;
|
|
|
|
pcre2_code* const re = nullptr;
|
|
|
|
static const std::array<regex_constants::error_type, 86> errcode_pcre2regex;
|
|
|
|
static const auto BASE = 10;
|
|
|
|
public:
|
|
inline regex() = default;
|
|
|
|
inline regex(const regex&) = delete;
|
|
|
|
inline regex(regex&&) = default;
|
|
|
|
inline auto operator=(const regex&) -> regex& = delete;
|
|
|
|
inline auto operator=(regex &&) -> regex& = delete;
|
|
|
|
explicit regex(const std::string& str) : regex(str.c_str()) {}
|
|
|
|
explicit regex(const char* const str)
|
|
: re{pcre2_compile((PCRE2_SPTR)str, PCRE2_ZERO_TERMINATED, 0, &errcode, &erroffset, nullptr)}
|
|
{
|
|
if (re == nullptr) {
|
|
std::vector<PCRE2_UCHAR> buffer(256);
|
|
int errlen;
|
|
|
|
errlen = pcre2_get_error_message(errcode, buffer.data(), buffer.size());
|
|
if (errlen < 0)
|
|
throw regex_error(errcode_pcre2regex.at(errlen));
|
|
|
|
std::string what = std::string("regex error: ") +
|
|
std::string(buffer.data(), buffer.data() + errlen) + '\n';
|
|
what += " '" + std::string{str} + "'\n";
|
|
what += " " + std::string(erroffset, ' ') + '^';
|
|
|
|
throw regex_error(errcode_pcre2regex.at(errcode), what.c_str());
|
|
}
|
|
}
|
|
|
|
~regex()
|
|
{
|
|
if (re != nullptr) {
|
|
pcre2_code_free(re);
|
|
}
|
|
}
|
|
|
|
inline auto operator()() const -> const pcre2_code*
|
|
{
|
|
return re;
|
|
}
|
|
};
|
|
|
|
class smatch {
|
|
friend auto regex_search(std::string::const_iterator begin,
|
|
std::string::const_iterator end,
|
|
smatch& match, // NOLINT(google-runtime-references)
|
|
const regex& rgx); // match std::regex interface.
|
|
|
|
private:
|
|
std::string::const_iterator begin;
|
|
|
|
std::string::const_iterator end;
|
|
|
|
std::vector<PCRE2_SIZE> vec{};
|
|
|
|
int n = 0;
|
|
|
|
public:
|
|
[[nodiscard]] inline auto position(int i = 0) const
|
|
{
|
|
return (i < 0 || i >= n) ? std::string::npos : vec[2 * i];
|
|
}
|
|
|
|
[[nodiscard]] inline auto length(int i = 0) const
|
|
{
|
|
return (i < 0 || i >= n) ? 0 : vec[2 * i + 1] - vec[2 * i];
|
|
}
|
|
|
|
[[nodiscard]] auto str(int i = 0) const -> std::string
|
|
{ // should we throw?
|
|
if (i < 0 || i >= n) {
|
|
return "";
|
|
}
|
|
int x = vec[2 * i];
|
|
if (x < 0) {
|
|
return "";
|
|
}
|
|
int y = vec[2 * i + 1];
|
|
return std::string{begin + x, begin + y};
|
|
}
|
|
|
|
[[nodiscard]] auto format(const std::string& fmt) const;
|
|
|
|
[[nodiscard]] auto size() const -> int
|
|
{
|
|
return n;
|
|
}
|
|
|
|
[[nodiscard]] inline auto empty() const
|
|
{
|
|
return n < 0;
|
|
}
|
|
|
|
[[nodiscard]] inline auto ready() const
|
|
{
|
|
return !vec.empty();
|
|
}
|
|
};
|
|
|
|
inline auto regex_search(const std::string& subj, const regex& rgx);
|
|
|
|
auto regex_replace(const std::string& subj, const regex& rgx, const std::string& insert);
|
|
|
|
inline auto regex_search(const std::string& subj,
|
|
smatch& match, // NOLINT(google-runtime-references)
|
|
const regex& rgx); // match std::regex interface.
|
|
|
|
auto regex_search(std::string::const_iterator begin,
|
|
std::string::const_iterator end,
|
|
smatch& match, // NOLINT(google-runtime-references)
|
|
const regex& rgx); // match std::regex interface.
|
|
|
|
// ------------------------- implementation: ----------------------------------
|
|
|
|
inline auto regex_search(const std::string& subj, const regex& rgx)
|
|
{
|
|
pcre2_match_data *match_data;
|
|
|
|
if (rgx() == nullptr) {
|
|
throw std::runtime_error("regex_search error: no regex given");
|
|
}
|
|
|
|
match_data = pcre2_match_data_create_from_pattern(rgx(), NULL);
|
|
|
|
int n =
|
|
pcre2_match(rgx(), (PCRE2_SPTR)subj.c_str(), static_cast<int>(subj.length()), 0, 0, match_data, nullptr);
|
|
|
|
pcre2_match_data_free(match_data);
|
|
|
|
return n >= 0;
|
|
}
|
|
|
|
auto regex_search(const std::string::const_iterator begin,
|
|
const std::string::const_iterator end,
|
|
smatch& match,
|
|
const regex& rgx)
|
|
{
|
|
if (rgx() == nullptr) {
|
|
throw std::runtime_error("regex_search error: no regex given");
|
|
}
|
|
|
|
int sz = 0;
|
|
pcre2_pattern_info(rgx(), PCRE2_INFO_CAPTURECOUNT, &sz);
|
|
sz = 3 * (sz + 1);
|
|
|
|
match.vec.reserve(sz);
|
|
|
|
const char* subj = &*begin;
|
|
int n, len = static_cast<int>(&*end - subj);
|
|
unsigned int ov_count;
|
|
PCRE2_SIZE *ov;
|
|
|
|
match.begin = begin;
|
|
match.end = end;
|
|
|
|
pcre2_match_data *match_data = pcre2_match_data_create(sz, NULL);
|
|
n = pcre2_match(rgx(), (PCRE2_SPTR)subj, len, 0, 0, match_data, NULL);
|
|
ov = pcre2_get_ovector_pointer(match_data);
|
|
ov_count = pcre2_get_ovector_count(match_data);
|
|
|
|
match.vec.assign(ov, ov + ov_count);
|
|
match.n = n;
|
|
|
|
pcre2_match_data_free(match_data);
|
|
|
|
if (match.n < 0) {
|
|
return false;
|
|
}
|
|
if (match.n == 0) {
|
|
match.n = sz / 3;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline auto regex_search(const std::string& subj, smatch& match, const regex& rgx)
|
|
{
|
|
return regex_search(subj.begin(), subj.end(), match, rgx);
|
|
}
|
|
|
|
auto smatch::format(const std::string& fmt) const
|
|
{
|
|
std::string ret{};
|
|
size_t index = 0;
|
|
|
|
size_t pos = 0;
|
|
while ((pos = fmt.find('$', index)) != std::string::npos) {
|
|
ret.append(fmt, index, pos - index);
|
|
index = pos + 1;
|
|
|
|
char chr = fmt[index++];
|
|
switch (chr) {
|
|
case '&': // match
|
|
ret += str(0);
|
|
break;
|
|
|
|
case '`': // prefix
|
|
ret.append(begin, begin + vec[0]);
|
|
break;
|
|
|
|
case '\'': // suffix
|
|
ret.append(begin + vec[1], end);
|
|
break;
|
|
|
|
default:
|
|
if (isdigit(chr) != 0) { // one or two digits => submatch:
|
|
int num = chr - '0';
|
|
chr = fmt[index];
|
|
if (isdigit(chr) != 0) { // second digit:
|
|
++index;
|
|
static const auto base = 10;
|
|
num = num * base + chr - '0';
|
|
}
|
|
ret += str(num);
|
|
break;
|
|
} // else:
|
|
|
|
ret += '$';
|
|
[[fallthrough]];
|
|
|
|
case '$': // escaped
|
|
ret += chr;
|
|
}
|
|
}
|
|
ret.append(fmt, index);
|
|
return ret;
|
|
}
|
|
|
|
auto regex_replace(const std::string& subj, const regex& rgx, const std::string& insert)
|
|
{
|
|
if (rgx() == nullptr) {
|
|
throw std::runtime_error("regex_replace error: no regex given");
|
|
}
|
|
|
|
std::string ret{};
|
|
auto pos = subj.begin();
|
|
|
|
for (smatch match; regex_search(pos, subj.end(), match, rgx);
|
|
pos += match.position(0) + match.length(0))
|
|
{
|
|
ret.append(pos, pos + match.position(0));
|
|
ret.append(match.format(insert));
|
|
}
|
|
|
|
ret.append(pos, subj.end());
|
|
return ret;
|
|
}
|
|
|
|
// ------------ There is only the translation table below : -------------------
|
|
|
|
const std::array<regex_constants::error_type, 86> regex::errcode_pcre2regex = {
|
|
// 0 no error
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 1 \ at end of pattern
|
|
regex_constants::error_escape,
|
|
// 2 \c at end of pattern
|
|
regex_constants::error_escape,
|
|
// 3 unrecognized character follows \ .
|
|
regex_constants::error_escape,
|
|
// 4 numbers out of order in {} quantifier
|
|
regex_constants::error_badbrace,
|
|
// 5 number too big in {} quantifier
|
|
regex_constants::error_badbrace,
|
|
// 6 missing terminating for character class
|
|
regex_constants::error_brack,
|
|
// 7 invalid escape sequence in character class
|
|
regex_constants::error_escape,
|
|
// 8 range out of order in character class
|
|
regex_constants::error_range,
|
|
// 9 nothing to repeat
|
|
regex_constants::error_badrepeat,
|
|
// 10 [this code is not in use
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 11 internal error: unexpected repeat
|
|
regex_constants::error_badrepeat,
|
|
// 12 unrecognized character after (? or (?-
|
|
regex_constants::error_backref,
|
|
// 13 POSIX named classes are supported only within a class
|
|
regex_constants::error_range,
|
|
// 14 missing )
|
|
regex_constants::error_paren,
|
|
// 15 reference to non-existent subpattern
|
|
regex_constants::error_backref,
|
|
// 16 erroffset passed as NULL
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 17 unknown option bit(s) set
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 18 missing ) after comment
|
|
regex_constants::error_paren,
|
|
// 19 [this code is not in use
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 20 regular expression is too large
|
|
regex_constants::error_space,
|
|
// 21 failed to get memory
|
|
regex_constants::error_stack,
|
|
// 22 unmatched parentheses
|
|
regex_constants::error_paren,
|
|
// 23 internal error: code overflow
|
|
regex_constants::error_stack,
|
|
// 24 unrecognized character after (?<
|
|
regex_constants::error_backref,
|
|
// 25 lookbehind assertion is not fixed length
|
|
regex_constants::error_backref,
|
|
// 26 malformed number or name after (?(
|
|
regex_constants::error_backref,
|
|
// 27 conditional group contains more than two branches
|
|
regex_constants::error_backref,
|
|
// 28 assertion expected after (?(
|
|
regex_constants::error_backref,
|
|
// 29 (?R or (?[+-digits must be followed by )
|
|
regex_constants::error_backref,
|
|
// 30 unknown POSIX class name
|
|
regex_constants::error_ctype,
|
|
// 31 POSIX collating elements are not supported
|
|
regex_constants::error_collate,
|
|
// 32 this version of PCRE is compiled without UTF support
|
|
regex_constants::error_collate,
|
|
// 33 [this code is not in use
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 34 character value in \x{} or \o{} is too large
|
|
regex_constants::error_escape,
|
|
// 35 invalid condition (?(0)
|
|
regex_constants::error_backref,
|
|
// 36 \C not allowed in lookbehind assertion
|
|
regex_constants::error_escape,
|
|
// 37 PCRE does not support \L, \l, \N{name}, \U, or \u
|
|
regex_constants::error_escape,
|
|
// 38 number after (?C is > 255
|
|
regex_constants::error_backref,
|
|
// 39 closing ) for (?C expected
|
|
regex_constants::error_paren,
|
|
// 40 recursive call could loop indefinitely
|
|
regex_constants::error_complexity,
|
|
// 41 unrecognized character after (?P
|
|
regex_constants::error_backref,
|
|
// 42 syntax error in subpattern name (missing terminator)
|
|
regex_constants::error_paren,
|
|
// 43 two named subpatterns have the same name
|
|
regex_constants::error_backref,
|
|
// 44 invalid UTF-8 string (specifically UTF-8)
|
|
regex_constants::error_collate,
|
|
// 45 support for \P, \p, and \X has not been compiled
|
|
regex_constants::error_escape,
|
|
// 46 malformed \P or \p sequence
|
|
regex_constants::error_escape,
|
|
// 47 unknown property name after \P or \p
|
|
regex_constants::error_escape,
|
|
// 48 subpattern name is too long (maximum 32 characters)
|
|
regex_constants::error_backref,
|
|
// 49 too many named subpatterns (maximum 10000)
|
|
regex_constants::error_complexity,
|
|
// 50 [this code is not in use
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 51 octal value is greater than \377 in 8-bit non-UTF-8 mode
|
|
regex_constants::error_escape,
|
|
// 52 internal error: overran compiling workspace
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 53 internal error: previously-checked referenced subpattern not found
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 54 DEFINE group contains more than one branch
|
|
regex_constants::error_backref,
|
|
// 55 repeating a DEFINE group is not allowed
|
|
regex_constants::error_backref,
|
|
// 56 inconsistent NEWLINE options
|
|
regex_constants::error_escape,
|
|
// 57 \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain
|
|
// number
|
|
regex_constants::error_backref,
|
|
// 58 a numbered reference must not be zero
|
|
regex_constants::error_backref,
|
|
// 59 an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)
|
|
regex_constants::error_complexity,
|
|
// 60 (*VERB) not recognized or malformed
|
|
regex_constants::error_complexity,
|
|
// 61 number is too big
|
|
regex_constants::error_complexity,
|
|
// 62 subpattern name expected
|
|
regex_constants::error_backref,
|
|
// 63 digit expected after (?+
|
|
regex_constants::error_backref,
|
|
// 64 is an invalid data character in JavaScript compatibility mode
|
|
regex_constants::error_escape,
|
|
// 65 different names for subpatterns of the same number are not allowed
|
|
regex_constants::error_backref,
|
|
// 66 (*MARK) must have an argument
|
|
regex_constants::error_complexity,
|
|
// 67 this version of PCRE is not compiled with Unicode property support
|
|
regex_constants::error_collate,
|
|
// 68 \c must be followed by an ASCII character
|
|
regex_constants::error_escape,
|
|
// 69 \k is not followed by a braced, angle-bracketed, or quoted name
|
|
regex_constants::error_backref,
|
|
// 70 internal error: unknown opcode in find_fixedlength()
|
|
regex_constants::error_type::_enum_error_last,
|
|
// 71 \N is not supported in a class
|
|
regex_constants::error_ctype,
|
|
// 72 too many forward references
|
|
regex_constants::error_backref,
|
|
// 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
|
|
regex_constants::error_escape,
|
|
// 74 invalid UTF-16 string (specifically UTF-16)
|
|
regex_constants::error_collate,
|
|
// 75 name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)
|
|
regex_constants::error_complexity,
|
|
// 76 character value in \u.... sequence is too large
|
|
regex_constants::error_escape,
|
|
// 77 invalid UTF-32 string (specifically UTF-32)
|
|
regex_constants::error_collate,
|
|
// 78 setting UTF is disabled by the application
|
|
regex_constants::error_collate,
|
|
// 79 non-hex character in \x{} (closing brace missing?)
|
|
regex_constants::error_escape,
|
|
// 80 non-octal character in \o{} (closing brace missing?)
|
|
regex_constants::error_escape,
|
|
// 81 missing opening brace after \o
|
|
regex_constants::error_brace,
|
|
// 82 parentheses are too deeply nested
|
|
regex_constants::error_complexity,
|
|
// 83 invalid range in character class
|
|
regex_constants::error_range,
|
|
// 84 group name must start with a non-digit
|
|
regex_constants::error_backref,
|
|
// 85 parentheses are too deeply nested (stack check)
|
|
regex_constants::error_stack};
|
|
|
|
} // namespace rgx
|
|
|
|
#endif
|