From d11a76349745576534fda87dd43d77d803a9bd86 Mon Sep 17 00:00:00 2001 From: Rot127 Date: Wed, 13 Dec 2023 18:02:54 -0500 Subject: [PATCH] Replace OpenBSD regex library with PCRE2. PCRE2 has way better performance than the OpenBSD library (something around 20 times faster). The following flags are enabled for every pattern: - PCRE2_UTF - PCRE2_MATCH_INVALID_UTF - PCRE2_NO_UTF_CHECK All the others are optional. Changes made: - Adds PCRE2 as subproject. - Changes the API away from POSIX to PCRE2. - Edits many regex patterns because: - ' ' is skipped in patterns, if the EXTENDED flag is set for matching. '\s' must be set now. - '.' doesn't match newlines by default. - Changes the API so matches and their groups are bundled into PVectors. - Moves the regex component to rz_util. --- .gitignore | 1 + .reuse/dep5 | 5 - binrz/rz-test/run.c | 17 +- binrz/rz-test/rz-test.c | 5 +- librz/asm/arch/hexagon/hexagon_arch.c | 8 +- librz/asm/asm.c | 27 +- librz/asm/p/asm_bf.c | 6 +- librz/asm/p/asm_hexagon.c | 26 +- librz/cons/less.c | 23 +- librz/cons/pager.c | 64 +- librz/cons/pager_private.h | 9 +- librz/core/casm.c | 8 +- librz/core/cbin.c | 2 +- librz/core/cmd/cmd_debug.c | 3 +- librz/core/cmd/cmd_search.c | 5 +- librz/core/core.c | 16 +- librz/include/meson.build | 2 +- librz/include/rz_regex.h | 77 - librz/include/rz_util.h | 2 +- librz/include/rz_util/rz_regex.h | 82 + librz/magic/file.h | 2 +- librz/magic/softmagic.c | 82 +- librz/parse/filter.c | 10 +- librz/parse/p/parse_arm_pseudo.c | 55 +- librz/parse/p/parse_mips_pseudo.c | 54 +- librz/parse/p/parse_x86_pseudo.c | 58 +- librz/reg/profile.c | 6 +- librz/search/regexp.c | 38 +- librz/util/list.c | 4 + librz/util/meson.build | 8 +- librz/util/print.c | 2 +- librz/util/regex.c | 411 +++++ librz/util/regex/COPYRIGHT | 54 - librz/util/regex/README | 5 - librz/util/regex/cclass.h | 70 - librz/util/regex/cname.h | 139 -- librz/util/regex/engine.c | 1076 ------------ librz/util/regex/re_format.7 | 756 --------- 
librz/util/regex/regcomp.c | 1786 -------------------- librz/util/regex/regerror.c | 132 -- librz/util/regex/regex.3 | 667 -------- librz/util/regex/regex2.h | 158 -- librz/util/regex/regexec.c | 174 -- librz/util/regex/test.c | 55 - librz/util/regex/utils.h | 62 - librz/util/str.c | 27 +- meson.build | 24 + meson_options.txt | 1 + subprojects/packagefiles/pcre2/meson.build | 85 + subprojects/pcre2.wrap | 8 + test/db/archos/darwin-arm64/dbg | 128 +- test/db/archos/darwin-x64/dbg | 84 +- test/db/archos/linux-x64/dbg_dmh | 18 +- test/db/archos/linux-x64/dbg_oo | 5 +- test/db/archos/linux-x64/dbg_step | 4 +- test/db/archos/linux-x64/dbg_trace | 4 +- test/db/archos/windows-x64/dbg_dts | 2 +- test/db/cmd/cmd_http_post | 2 +- test/db/cmd/cmd_pd2 | 4 +- test/db/formats/pdb | 2 +- test/db/formats/pyc | 38 +- test/unit/test_regex.c | 173 +- test/unit/test_str.c | 4 +- 63 files changed, 1196 insertions(+), 5669 deletions(-) delete mode 100644 librz/include/rz_regex.h create mode 100644 librz/include/rz_util/rz_regex.h create mode 100644 librz/util/regex.c delete mode 100644 librz/util/regex/COPYRIGHT delete mode 100644 librz/util/regex/README delete mode 100644 librz/util/regex/cclass.h delete mode 100644 librz/util/regex/cname.h delete mode 100644 librz/util/regex/engine.c delete mode 100644 librz/util/regex/re_format.7 delete mode 100644 librz/util/regex/regcomp.c delete mode 100644 librz/util/regex/regerror.c delete mode 100644 librz/util/regex/regex.3 delete mode 100644 librz/util/regex/regex2.h delete mode 100644 librz/util/regex/regexec.c delete mode 100644 librz/util/regex/test.c delete mode 100644 librz/util/regex/utils.h create mode 100644 subprojects/packagefiles/pcre2/meson.build create mode 100644 subprojects/pcre2.wrap diff --git a/.gitignore b/.gitignore index f499c34b615..6ea09b4dad2 100644 --- a/.gitignore +++ b/.gitignore @@ -117,6 +117,7 @@ peda-session-* .cache/ test/.tmp/* subprojects/capstone-*/ +subprojects/pcre2/ subprojects/libzip-*/ 
subprojects/lz4-*/ subprojects/packagecache/ diff --git a/.reuse/dep5 b/.reuse/dep5 index a472729220e..bcf65d0f3d8 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -188,11 +188,6 @@ Copyright: 1986-1995 Ian F. Darwin 1995-present Christos Zoulas and others License: BSD-2-Clause -Files: librz/util/regex/* -Copyright: 1992, 1993, 1994 Henry Spencer - 1992, 1993, 1994 The Regents of the University of California -License: BSD-3-Clause - Files: subprojects/rzheap/rz_jemalloc/* Copyright: 2002-present Jason Evans 2007-2012 Mozilla Foundation. diff --git a/binrz/rz-test/run.c b/binrz/rz-test/run.c index d27cfd26626..a48b10333c8 100644 --- a/binrz/rz-test/run.c +++ b/binrz/rz-test/run.c @@ -2,6 +2,8 @@ // SPDX-License-Identifier: LGPL-3.0-only #include "rz_test.h" +#include +#include #include #if __WINDOWS__ @@ -193,11 +195,16 @@ RZ_API RzSubprocessOutput *rz_test_run_cmd_test(RzTestRunConfig *config, RzCmdTe RZ_API bool rz_test_cmp_cmd_output(const char *output, const char *expect, const char *regexp) { if (regexp) { - RzList *matches = rz_regex_get_match_list(regexp, "e", output); - const char *match = rz_list_to_str(matches, '\0'); - bool equal = (0 == strcmp(expect, match)); - rz_list_free(matches); - RZ_FREE(match); + RzStrBuf *match_str = rz_regex_full_match_str(regexp, output, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n"); + bool equal = false; + ut32 expect_len = strlen(expect); + if (expect_len > 0 && expect[expect_len - 1] == '\n') { + // Ignore newline + equal = (rz_str_cmp(expect, rz_strbuf_get(match_str), expect_len - 1) == 0); + } else { + equal = RZ_STR_EQ(expect, rz_strbuf_get(match_str)); + } + rz_strbuf_free(match_str); return equal; } return (0 == strcmp(expect, output)); diff --git a/binrz/rz-test/rz-test.c b/binrz/rz-test/rz-test.c index 88caddbe769..ca3ba88c38e 100644 --- a/binrz/rz-test/rz-test.c +++ b/binrz/rz-test/rz-test.c @@ -743,9 +743,8 @@ static void print_diff(const char *actual, const char *expected, const char *reg const 
char *output = actual; if (regexp) { - RzList *matches = rz_regex_get_match_list(regexp, "e", actual); - output = rz_list_to_str(matches, '\0'); - rz_list_free(matches); + RzStrBuf *match_str = rz_regex_full_match_str(regexp, actual, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n"); + output = rz_strbuf_drain(match_str); } d = rz_diff_lines_new(expected, output, NULL); diff --git a/librz/asm/arch/hexagon/hexagon_arch.c b/librz/asm/arch/hexagon/hexagon_arch.c index ece2396cb06..0d0dbf28c84 100644 --- a/librz/asm/arch/hexagon/hexagon_arch.c +++ b/librz/asm/arch/hexagon/hexagon_arch.c @@ -888,13 +888,17 @@ RZ_API void hexagon_reverse_opcode(const RzAsm *rz_asm, HexReversedOpcode *rz_re memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp)); rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text); rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns); - rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type; + if (rz_reverse->asm_op->asm_toks) { + rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type; + } break; case HEXAGON_DISAS: memcpy(rz_reverse->asm_op, &hic->asm_op, sizeof(RzAsmOp)); rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text); rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns); - rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type; + if (rz_reverse->asm_op->asm_toks) { + rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type; + } break; case HEXAGON_ANALYSIS: memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp)); diff --git a/librz/asm/asm.c b/librz/asm/asm.c index e7931366f37..956abfc5a96 100644 --- a/librz/asm/asm.c +++ b/librz/asm/asm.c @@ -5,7 +5,7 @@ #include "rz_util/rz_print.h" #include #include -#include +#include #include #include #include @@ -1545,7 +1545,7 @@ RZ_API void rz_asm_compile_token_patterns(RZ_INOUT RzPVector /*regex) { - pat->regex = rz_regex_new(pat->pattern, "e"); + 
pat->regex = rz_regex_new(pat->pattern, RZ_REGEX_EXTENDED, 0); if (!pat->regex) { RZ_LOG_WARN("Did not compile regex pattern %s.\n", pat->pattern); rz_warn_if_reached(); @@ -1584,32 +1584,31 @@ RZ_API RZ_OWN RzAsmTokenString *rz_asm_tokenize_asm_regex(RZ_BORROW RzStrBuf *as } } - /// Start pattern search from the beginning - size_t asm_str_off = 0; - // Search for token pattern. - RzRegexMatch match[1]; - while (rz_regex_exec(pattern->regex, asm_str + asm_str_off, 1, match, 0) == 0) { - st64 match_start = match[0].rm_so; // Token start - st64 match_end = match[0].rm_eo; // Token end - st64 len = match_end - match_start; // Length of token - st64 tok_offset = asm_str_off + match_start; // Token offset in str + RzPVector *match_sets = rz_regex_match_all(pattern->regex, asm_str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + void **grouped_match; + rz_pvector_foreach (match_sets, grouped_match) { + if (rz_pvector_empty(*grouped_match)) { + continue; + } + RzRegexMatch *match = rz_pvector_at(*grouped_match, 0); + st64 match_start = match->start; // Token start + st64 len = match->len; // Length of token + st64 tok_offset = match_start; // Token offset in str if (overlaps_with_token(toks->tokens, tok_offset, tok_offset + len - 1)) { // If this is true a token with higher priority was matched before. - asm_str_off = tok_offset + len; continue; } // New token found, add it. 
if (!is_num(asm_str + tok_offset)) { add_token(toks, tok_offset, len, pattern->type, 0); - asm_str_off = tok_offset + len; continue; } ut64 number = strtoull(asm_str + tok_offset, NULL, 0); add_token(toks, tok_offset, len, pattern->type, number); - asm_str_off = tok_offset + len; } + rz_pvector_free(match_sets); } rz_vector_sort(toks->tokens, (RzVectorComparator)cmp_tokens, false, NULL); diff --git a/librz/asm/p/asm_bf.c b/librz/asm/p/asm_bf.c index 3e271e73004..7a784a185e7 100644 --- a/librz/asm/p/asm_bf.c +++ b/librz/asm/p/asm_bf.c @@ -30,14 +30,14 @@ static RZ_OWN RzPVector /**/ *get_token_patterns(RzAsm *a) pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_REGISTER; pat->pattern = strdup( - "(ptr)"); + "ptr"); rz_pvector_push(pvec, pat); // reference pattern pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_OPERATOR; pat->pattern = strdup( - "(\\[)|(\\])" // Matches a single bracket + "\\[|\\]" // Matches a single bracket ); rz_pvector_push(pvec, pat); @@ -45,7 +45,7 @@ static RZ_OWN RzPVector /**/ *get_token_patterns(RzAsm *a) pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_SEPARATOR; pat->pattern = strdup( - "([[:blank:]]+)"); + "\\s+"); rz_pvector_push(pvec, pat); return pvec; diff --git a/librz/asm/p/asm_hexagon.c b/librz/asm/p/asm_hexagon.c index e0506c4117f..e8db9add945 100644 --- a/librz/asm/p/asm_hexagon.c +++ b/librz/asm/p/asm_hexagon.c @@ -30,15 +30,15 @@ static RZ_OWN RzPVector /**/ *get_token_patterns(HexState * RzAsmTokenPattern *pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_META; pat->pattern = strdup( - "(^[\\[\\?\\/\\|\\\\\\{])|(┌)|(│)|(└)|" // Packet prefix - "((∎)|[<\\}])([ :])(endloop[01]{1,2})" // Endloop markers + "^[\\[\\?\\/\\|\\\\\\{┌│└]|" // Packet prefix + "(∎|[<\\}])[\\s:]endloop[01]{1,2}" // Endloop markers ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_META; pat->pattern = strdup( - "(#{1,2})|(\\}$)|" // Immediate prefix, Closing packet bracket + 
"\\#{1,2}|\\}$|" // Immediate prefix, Closing packet bracket "\\.new|:n?t|:raw|" // .new and jump hints ); rz_pvector_push(pvec, pat); @@ -46,7 +46,7 @@ static RZ_OWN RzPVector /**/ *get_token_patterns(HexState * pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_REGISTER; pat->pattern = strdup( - "([CNPRMQVO][[:digit:]]{1,2}(:[[:digit:]]{1,2})?(in)?)" // Registers and double registers + "[CNPRMQVO]\\d{1,2}(:\\d{1,2})?(in)?" // Registers and double registers ); rz_pvector_push(pvec, pat); @@ -60,51 +60,51 @@ static RZ_OWN RzPVector /**/ *get_token_patterns(HexState * pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_NUMBER; pat->pattern = strdup( - "(0x[[:digit:]abcdef]+)" // Hexadecimal numbers + "0x(\\d|[abcdef])+" // Hexadecimal numbers ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_MNEMONIC; pat->pattern = strdup( - "([[:alpha:]]+[[:digit:]]+[[:alpha:]]*)" // Mnemonics with a decimal number in the name. + "[a-zA-Z]+\\d+[a-zA-Z]*" // Mnemonics with a decimal number in the name. ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_NUMBER; pat->pattern = strdup( - "([[:digit:]]+)" // Decimal numbers + "\\d+" // Decimal numbers ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_SEPARATOR; pat->pattern = strdup( - "([[:blank:]]+)|" // Spaces and tabs - "([,;\\.\\(\\)\\{\\}:])" // Brackets and others + "\\s+|" // Spaces and tabs + "[,;\\.\\(\\)\\{\\}:]" // Brackets and others ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_OPERATOR; pat->pattern = strdup( - "(\\+)|(=)|(!)|(-)" // +,-,=,],[, ! (not the packet prefix) + "[\\+=!-]" // +,-,=,],[, ! (not the packet prefix) ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_OPERATOR; pat->pattern = strdup( - "(\\])|(\\[|<{1,2}|>{1,2})" // +,-,=,],[, ! 
(not the packet prefix) + "\\]|\\[|<{1,2}|>{1,2}" // ],[,<,<<,>,>> ); rz_pvector_push(pvec, pat); pat = RZ_NEW0(RzAsmTokenPattern); pat->type = RZ_ASM_TOKEN_MNEMONIC; pat->pattern = strdup( - "([[:alnum:]]+)|" // Alphanumeric mnemonics - "([[:alnum:]]+_[[:alnum:]]+)" // Menmonics with "_" e.g dealloc_return + "\\w+_\\w+|" // Mnemonics with "_", e.g. dealloc_return + "\\w+" // Alphanumeric mnemonics ); rz_pvector_push(pvec, pat); diff --git a/librz/cons/less.c b/librz/cons/less.c index 1127d8def5f..de89fbd5912 100644 --- a/librz/cons/less.c +++ b/librz/cons/less.c @@ -3,9 +3,10 @@ // SPDX-License-Identifier: LGPL-3.0-only #include -#include +#include #include #include "pager_private.h" +#include "rz_vector.h" #define I(x) rz_cons_singleton()->x @@ -31,7 +32,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { RzRegex *rx = NULL; int w, h, ch, to, ui = 1, from = 0, i; const char *sreg; - RzList **mla; + RzPVector **mla; // rcons kills str after flushing the buffer, so we must keep a copy char *ostr = strdup(str); @@ -47,7 +48,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { if (lines_count < 1) { mla = NULL; } else { - mla = calloc(lines_count, sizeof(RzList *)); + mla = calloc(lines_count, sizeof(RzPVector *)); if (!mla) { free(p); free(ostr); @@ -55,9 +56,6 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { return 0; } } - for (i = 0; i < lines_count; i++) { - mla[i] = rz_list_new(); - } rz_cons_set_raw(true); rz_cons_show_cursor(false); rz_cons_reset(); @@ -75,7 +73,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { ch = rz_cons_readchar(); if (exitkeys && strchr(exitkeys, ch)) { for (i = 0; i < lines_count; i++) { - rz_list_free(mla[i]); + rz_pvector_free(mla[i]); } free(p); free(mla); @@ -129,7 +127,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { if (rx) { rz_regex_free(rx); } - rx = rz_regex_new(sreg, ""); + rx = 
rz_regex_new(sreg, RZ_REGEX_EXTENDED | RZ_REGEX_MULTILINE, 0); } else { /* we got an empty string */ from = pager_next_match(from, mla, lines_count); break; @@ -138,9 +136,12 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { break; } /* find all occurrences */ - if (pager_all_matches(p, rx, mla, lines, lines_count)) { - from = pager_next_match(from, mla, lines_count); + RzPVector *matches = rz_regex_match_all_not_grouped(rx, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + if (rz_pvector_empty(matches)) { + rz_pvector_free(matches); + break; } + from = pager_next_match(from, mla, lines_count); break; case 'n': /* next match */ /* search already performed */ @@ -157,7 +158,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) { } } for (i = 0; i < lines_count; i++) { - rz_list_free(mla[i]); + rz_pvector_free(mla[i]); } free(mla); rz_regex_free(rx); diff --git a/librz/cons/pager.c b/librz/cons/pager.c index 545cc98ea30..c6c09c2384e 100644 --- a/librz/cons/pager.c +++ b/librz/cons/pager.c @@ -1,16 +1,15 @@ // SPDX-FileCopyrightText: 2019 pancake // SPDX-License-Identifier: LGPL-3.0-only -#include +#include #include #include #include "pager_private.h" +#include "rz_vector.h" -RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzList /**/ *ml) { +RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzPVector /**/ *ml) { int m_len, offset = 0; char *m_addr; - RzListIter *it; - RzRegexMatch *m; char *inv[2] = { RZ_CONS_INVERT(true, true), RZ_CONS_INVERT(false, true) @@ -20,15 +19,17 @@ RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzList /*rm_so - offset); + rz_strpool_memcat(p, line + offset, m->start - offset); rz_strpool_memcat(p, inv[0], linv[0]); - m_len = m->rm_eo - m->rm_so; + m_len = m->len; if (m_len < 0) { m_len = 0; } - m_addr = rz_str_ndup(line + m->rm_so, m_len); + m_addr = rz_str_ndup(line + m->start, m_len); if (m_addr) { /* in case there's a CSI in the middle of this match*/ 
m_len = rz_str_ansi_filter(m_addr, NULL, NULL, m_len); @@ -37,7 +38,7 @@ RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzList /*rm_eo; + offset = m->start + m->len; free(m_addr); } } @@ -45,7 +46,7 @@ RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzList /**/ **mla, int from, int to, int w) { +RZ_IPI void pager_printpage(const char *line, int *index, RzPVector /**/ **mla, int from, int to, int w) { int i; rz_cons_clear00(); @@ -71,68 +72,33 @@ RZ_IPI void pager_printpage(const char *line, int *index, RzList /**/ **mla, int lcount) { +RZ_IPI int pager_next_match(int from, RzPVector /**/ **mla, int lcount) { int l; if (from > lcount - 2) { return from; } for (l = from + 1; l < lcount; l++) { /* if there's at least one match on the line */ - if (rz_list_first(mla[l])) { + if (!rz_pvector_empty(mla[l])) { return l; } } return from; } -RZ_IPI int pager_prev_match(int from, RzList /**/ **mla) { +RZ_IPI int pager_prev_match(int from, RzPVector /**/ **mla) { int l; if (from < 1) { return from; } for (l = from - 1; l > 0; l--) { - if (rz_list_first(mla[l])) { + if (!rz_pvector_empty(mla[l])) { return l; } } return from; } -RZ_IPI bool pager_all_matches(const char *s, RzRegex *rx, RzList /**/ **mla, int *lines, int lcount) { - bool res = false; - RzRegexMatch m = { 0 }; - int l, slen; - for (l = 0; l < lcount; l++) { - m.rm_so = 0; - const char *loff = s + lines[l]; /* current line offset */ - char *clean = strdup(loff); - if (!clean) { - return false; - } - int *cpos = NULL; - int ncpos = rz_str_ansi_filter(clean, NULL, &cpos, -1); - m.rm_eo = slen = strlen(clean); - rz_list_purge(mla[l]); - while (!rz_regex_exec(rx, clean, 1, &m, RZ_REGEX_STARTEND)) { - if (!cpos || m.rm_so >= ncpos) { - break; - } - RzRegexMatch *ms = RZ_NEW0(RzRegexMatch); - if (ms && cpos) { - ms->rm_so = cpos[m.rm_so]; - ms->rm_eo = cpos[m.rm_eo]; - rz_list_append(mla[l], ms); - } - m.rm_so = m.rm_eo; - m.rm_eo = slen; - res = true; - } - free(cpos); - free(clean); - } - 
return res; -} - RZ_IPI int *pager_splitlines(char *s, int *lines_count) { int lines_size = 128; int *lines = NULL; diff --git a/librz/cons/pager_private.h b/librz/cons/pager_private.h index 5c2c8b79e36..749735eda1d 100644 --- a/librz/cons/pager_private.h +++ b/librz/cons/pager_private.h @@ -4,11 +4,10 @@ #ifndef PAGER_PRIVATE_H #define PAGER_PRIVATE_H -RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzList /**/ *ml); -RZ_IPI void pager_printpage(const char *line, int *index, RzList /**/ **mla, int from, int to, int w); -RZ_IPI int pager_next_match(int from, RzList /**/ **mla, int lcount); -RZ_IPI int pager_prev_match(int from, RzList /**/ **mla); -RZ_IPI bool pager_all_matches(const char *s, RzRegex *rx, RzList /**/ **mla, int *lines, int lcount); +RZ_IPI void pager_color_line(const char *line, RzStrpool *p, RzPVector /**/ *ml); +RZ_IPI void pager_printpage(const char *line, int *index, RzPVector /**/ **mla, int from, int to, int w); +RZ_IPI int pager_next_match(int from, RzPVector /**/ **mla, int lcount); +RZ_IPI int pager_prev_match(int from, RzPVector /**/ **mla); RZ_IPI int *pager_splitlines(char *s, int *lines_count); #endif diff --git a/librz/core/casm.c b/librz/core/casm.c index a3d3a62335c..405c001f5ae 100644 --- a/librz/core/casm.c +++ b/librz/core/casm.c @@ -2,6 +2,8 @@ // SPDX-FileCopyrightText: 2009-2019 pancake // SPDX-License-Identifier: LGPL-3.0-only +#include +#include #include #include #include @@ -342,9 +344,11 @@ RZ_API RzList /**/ *rz_core_asm_strsearch(RzCore *core, const ch } else if (!regexp) { matches = strstr(opst, tokens[matchcount]) != NULL; } else { - rx = rz_regex_new(tokens[matchcount], "es"); - matches = rz_regex_exec(rx, opst, 0, 0, 0) == 0; + rx = rz_regex_new(tokens[matchcount], RZ_REGEX_EXTENDED, 0); + RzPVector *tmp_m = rz_regex_match_first(rx, opst, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + matches = (!rz_pvector_empty(tmp_m) && tmp_m != NULL) ? 
1 : 0; rz_regex_free(rx); + rz_pvector_free(tmp_m); } } if (align && align > 1) { diff --git a/librz/core/cbin.c b/librz/core/cbin.c index 39cbc63f39d..87c5314c934 100644 --- a/librz/core/cbin.c +++ b/librz/core/cbin.c @@ -3524,7 +3524,7 @@ static void classdump_objc(RzBinClass *c) { RzBinClassField *f; RzBinSymbol *sym; rz_list_foreach (c->fields, iter2, f) { - if (f->name && rz_regex_match("ivar", "e", f->name)) { + if (f->name && strstr(f->name, "ivar")) { /* haystack first: keep fields whose name contains "ivar" */ rz_cons_printf(" %s %s\n", f->type, f->name); } } diff --git a/librz/core/cmd/cmd_debug.c b/librz/core/cmd/cmd_debug.c index 8fc8c29a840..ba970c79f64 100644 --- a/librz/core/cmd/cmd_debug.c +++ b/librz/core/cmd/cmd_debug.c @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: 2009-2020 pancake // SPDX-License-Identifier: LGPL-3.0-only +#include #include #include #include @@ -315,7 +316,7 @@ static bool step_until_inst(RzCore *core, const char *instr, bool regex) { if (ret > 0) { const char *buf_asm = rz_asm_op_get_asm(&asmop); if (regex) { - if (rz_regex_match(instr, "e", buf_asm)) { + if (rz_regex_contains(instr, buf_asm, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT)) { RZ_LOG_ERROR("core: esil: stop.\n"); break; } diff --git a/librz/core/cmd/cmd_search.c b/librz/core/cmd/cmd_search.c index 37488355ff5..47cb9dc2e47 100644 --- a/librz/core/cmd/cmd_search.c +++ b/librz/core/cmd/cmd_search.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "../core_private.h" @@ -1080,8 +1081,8 @@ static RzList /**/ *construct_rop_gadget(RzCore *core, ut64 addr idx += opsz; addr += opsz; if (rx) { - grep_find = !rz_regex_match(rx, "e", opst); - search_hit = (end && grep && (grep_find < 1)); + grep_find = rz_regex_contains(rx, opst, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT); + search_hit = (end && grep && grep_find); } else { search_hit = (end && grep && strstr(opst, grep_str)); } diff --git a/librz/core/core.c b/librz/core/core.c index 9bca0ef3d2a..b88a881b59d 100644 --- 
a/librz/core/core.c +++ b/librz/core/core.c @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: 2009-2020 pancake // SPDX-License-Identifier: LGPL-3.0-only +#include +#include #include #include #include @@ -1377,17 +1379,17 @@ static void autocomplete_theme(RzCore *core, RzLineCompletion *completion, const static bool find_e_opts(RzCore *core, RzLineCompletion *completion, RzLineBuffer *buf) { const char *pattern = "e (.*)="; - RzRegex *rx = rz_regex_new(pattern, "e"); - const size_t nmatch = 2; - RzRegexMatch pmatch[2] = { 0 }; + RzRegex *rx = rz_regex_new(pattern, RZ_REGEX_EXTENDED, 0); bool ret = false; - if (rz_regex_exec(rx, buf->data, nmatch, pmatch, 1)) { + RzPVector *matches = rz_regex_match_all_not_grouped(rx, buf->data, buf->length, 0, RZ_REGEX_DEFAULT); + if (!matches || rz_pvector_empty(matches) || rz_pvector_len(matches) < 2) { goto out; } int i; char *str = NULL, *sp; - for (i = pmatch[1].rm_so; i < pmatch[1].rm_eo; i++) { + RzRegexMatch *m1 = rz_pvector_at(matches, 1); + for (i = m1->start; i < m1->start + m1->len; i++) { str = rz_str_appendch(str, buf->data[i]); } if (!str) { @@ -1403,7 +1405,8 @@ static bool find_e_opts(RzCore *core, RzLineCompletion *completion, RzLineBuffer *sp = ' '; } if (!node) { - return false; + ret = false; + goto out; } RzListIter *iter; char *option; @@ -1420,6 +1423,7 @@ static bool find_e_opts(RzCore *core, RzLineCompletion *completion, RzLineBuffer out: rz_regex_free(rx); + rz_pvector_free(matches); return ret; } diff --git a/librz/include/meson.build b/librz/include/meson.build index e3be2161099..5d18f64b548 100644 --- a/librz/include/meson.build +++ b/librz/include/meson.build @@ -38,7 +38,6 @@ include_files = [ 'rz_platform.h', 'rz_project.h', 'rz_reg.h', - 'rz_regex.h', 'rz_search.h', 'rz_sign.h', 'rz_skiplist.h', @@ -96,6 +95,7 @@ rz_util_files = [ 'rz_util/rz_punycode.h', 'rz_util/rz_range.h', 'rz_util/rz_rbtree.h', + 'rz_util/rz_regex.h', 'rz_util/rz_serialize.h', 'rz_util/rz_signal.h', 'rz_util/rz_spaces.h', diff --git 
a/librz/include/rz_regex.h b/librz/include/rz_regex.h deleted file mode 100644 index 0b56cec8f2a..00000000000 --- a/librz/include/rz_regex.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef RZ_REGEX_H -#define RZ_REGEX_H - -#include -#include -#include - -typedef struct rz_regex_t { - int re_magic; - size_t re_nsub; /* number of parenthesized subexpressions */ - const char *re_endp; /* end pointer for RZ_REGEX_PEND */ - struct re_guts *re_g; /* none of your business :-) */ - int re_flags; -} RzRegex; - -typedef struct rz_regmatch_t { - st64 rm_so; /* start of match */ - st64 rm_eo; /* end of match */ -} RzRegexMatch; - -/* regcomp() flags */ -#define RZ_REGEX_BASIC 0000 -#define RZ_REGEX_EXTENDED 0001 -#define RZ_REGEX_ICASE 0002 -#define RZ_REGEX_NOSUB 0004 -#define RZ_REGEX_NEWLINE 0010 -#define RZ_REGEX_NOSPEC 0020 -#define RZ_REGEX_PEND 0040 -#define RZ_REGEX_DUMP 0200 - -/* regerror() flags */ -#define RZ_REGEX_ENOSYS (-1) /* Reserved */ -#define RZ_REGEX_NOMATCH 1 -#define RZ_REGEX_BADPAT 2 -#define RZ_REGEX_ECOLLATE 3 -#define RZ_REGEX_ECTYPE 4 -#define RZ_REGEX_EESCAPE 5 -#define RZ_REGEX_ESUBREG 6 -#define RZ_REGEX_EBRACK 7 -#define RZ_REGEX_EPAREN 8 -#define RZ_REGEX_EBRACE 9 -#define RZ_REGEX_BADBR 10 -#define RZ_REGEX_ERANGE 11 -#define RZ_REGEX_ESPACE 12 -#define RZ_REGEX_BADRPT 13 -#define RZ_REGEX_EMPTY 14 -#define RZ_REGEX_ASSERT 15 -#define RZ_REGEX_INVARG 16 -#define RZ_REGEX_ILLSEQ 17 -#define RZ_REGEX_ATOI 255 /* convert name to number (!) */ -#define RZ_REGEX_ITOA 0400 /* convert number to name (!) 
*/ - -/* regexec() flags */ -#define RZ_REGEX_NOTBOL 00001 -#define RZ_REGEX_NOTEOL 00002 -#define RZ_REGEX_STARTEND 00004 -#define RZ_REGEX_TRACE 00400 /* tracing of execution */ -#define RZ_REGEX_LARGE 01000 /* force large representation */ -#define RZ_REGEX_BACKR 02000 /* force use of backref code */ - -RZ_API RzRegex *rz_regex_new(const char *pattern, const char *cflags); -RZ_API int rz_regex_match(const char *pattern, const char *flags, const char *text); -RZ_API char *rz_regex_match_extract(RZ_NONNULL const char *str, RZ_NONNULL RzRegexMatch *match); -RZ_API RzList /**/ *rz_regex_get_match_list(const char *pattern, const char *flags, const char *text); -RZ_API int rz_regex_flags(const char *flags); -RZ_API int rz_regex_comp(RzRegex *, const char *, int); -RZ_API size_t rz_regex_error(int, const RzRegex *, char *, size_t); -/* - * gcc under c99 mode won't compile "[]" by itself. As a workaround, - * a dummy argument name is added. - */ -RZ_API bool rz_regex_check(const RzRegex *rr, const char *str); -RZ_API int rz_regex_exec(const RzRegex *preg, const char *string, size_t nmatch, RzRegexMatch __pmatch[], int eflags); -RZ_API void rz_regex_free(RzRegex *); -RZ_API void rz_regex_fini(RzRegex *); - -#endif /* !_REGEX_H_ */ diff --git a/librz/include/rz_util.h b/librz/include/rz_util.h index efb802be707..c6ba827508e 100644 --- a/librz/include/rz_util.h +++ b/librz/include/rz_util.h @@ -6,7 +6,6 @@ #include #include -#include #include #include // rizin linked list #include // skiplist @@ -50,6 +49,7 @@ #include "rz_util/rz_panels.h" #include "rz_util/rz_punycode.h" #include "rz_util/rz_range.h" +#include "rz_util/rz_regex.h" #include "rz_util/rz_signal.h" #include "rz_util/rz_spaces.h" #include "rz_util/rz_stack.h" diff --git a/librz/include/rz_util/rz_regex.h b/librz/include/rz_util/rz_regex.h new file mode 100644 index 00000000000..8aeae28a9c1 --- /dev/null +++ b/librz/include/rz_util/rz_regex.h @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: 2023 Rot127 +// 
SPDX-License-Identifier: LGPL-3.0-only + +#ifndef RZ_REGEX_H +#define RZ_REGEX_H + +#include +#include +#include +#include +#include + +#define RZ_REGEX_SIZE size_t + +// Some basic PCRE2 macros. PCRE2 defines many more; +// add them here when they are needed. +#define RZ_REGEX_ERROR_NOMATCH (-1) /* PCRE2_ERROR_NOMATCH */ +#define RZ_REGEX_ERROR_PARTIAL (-2) /* PCRE2_ERROR_PARTIAL */ + +#define RZ_REGEX_DEFAULT 0 +#define RZ_REGEX_CASELESS 0x00000008u /* PCRE2_CASELESS */ +#define RZ_REGEX_EXTENDED 0x00000080u /* PCRE2_EXTENDED */ +#define RZ_REGEX_EXTENDED_MORE 0x01000000u /* PCRE2_EXTENDED_MORE */ +#define RZ_REGEX_MULTILINE 0x00000400u /* PCRE2_MULTILINE */ + +#define RZ_REGEX_JIT_PARTIAL_SOFT 0x00000002u /* PCRE2_JIT_PARTIAL_SOFT */ +#define RZ_REGEX_JIT_PARTIAL_HARD 0x00000004u /* PCRE2_JIT_PARTIAL_HARD */ + +#define RZ_REGEX_PARTIAL_SOFT 0x00000010u /* PCRE2_PARTIAL_SOFT */ +#define RZ_REGEX_PARTIAL_HARD 0x00000020u /* PCRE2_PARTIAL_HARD */ + +#define RZ_REGEX_UNSET (~(RZ_REGEX_SIZE)0) /* PCRE2_UNSET */ +#define RZ_REGEX_ZERO_TERMINATED (~(RZ_REGEX_SIZE)0) /* PCRE2_ZERO_TERMINATED */ + +typedef int RzRegexStatus; ///< A status number returned by the regex API. +typedef size_t RzRegexSize; ///< Size of a text or regex, measured in code units. For UTF-8: bytes. +typedef ut32 RzRegexFlags; ///< Regex flag bits. +typedef uint8_t *RzRegexPattern; ///< A regex pattern string. +typedef void RzRegex; ///< A regex expression. + +typedef struct { + RzRegexSize group_idx; ///< Index of the group. Used to determine name if any was given. + RzRegexSize start; ///< Start offset into the text where the match starts. + RzRegexSize len; ///< Length of match in bytes. 
+} RzRegexMatch; + +typedef void RzRegexMatchData; ///< PCRE2 internal match data type + +RZ_API RZ_OWN RzRegex *rz_regex_new(RZ_NONNULL const char *pattern, RzRegexFlags cflags, RzRegexFlags jflags); +RZ_API void rz_regex_free(RZ_OWN RzRegex *regex); +RZ_API void rz_regex_error_msg(RzRegexStatus errcode, RZ_OUT char *errbuf, RzRegexSize errbuf_size); +RZ_API const ut8 *rz_regex_get_match_name(RZ_NONNULL const RzRegex *regex, ut32 name_idx); +RZ_API RzRegexStatus rz_regex_match(RZ_NONNULL const RzRegex *regex, RZ_NONNULL const char *text, + RzRegexSize text_size, + RzRegexSize text_offset, + RzRegexFlags mflags); +RZ_API RZ_OWN RzPVector /**/ *rz_regex_match_all_not_grouped( + RZ_NONNULL const RzRegex *regex, + RZ_NONNULL const char *text, + RzRegexSize text_size, + RzRegexSize text_offset, + RzRegexFlags mflags); +RZ_API RZ_OWN RzPVector /**/ *rz_regex_match_first( + RZ_NONNULL const RzRegex *regex, + RZ_NONNULL const char *text, + RzRegexSize text_size, + RzRegexSize text_offset, + RzRegexFlags mflags); +RZ_API RZ_OWN RzPVector /* *>*/ *rz_regex_match_all( + RZ_NONNULL const RzRegex *regex, + RZ_NONNULL const char *text, + RzRegexSize text_size, + RzRegexSize text_offset, + RzRegexFlags mflags); +RZ_API bool rz_regex_contains(RZ_NONNULL const char *pattern, RZ_NONNULL const char *text, + RzRegexSize text_size, + RzRegexFlags cflags, RzRegexFlags mflags); +RZ_API RZ_OWN RzStrBuf *rz_regex_full_match_str(RZ_NONNULL const char *pattern, RZ_NONNULL const char *text, + RzRegexSize text_size, + RzRegexFlags cflags, RzRegexFlags mflags, RZ_NONNULL const char *separator); + +#endif /* RZ_REGEX_H */ diff --git a/librz/magic/file.h b/librz/magic/file.h index 92d3a0e7f97..92b6f499249 100644 --- a/librz/magic/file.h +++ b/librz/magic/file.h @@ -41,7 +41,7 @@ #include #include /* For open and flags */ #include // TODO: use utX -#include +#include #include /* Do this here and now, because struct stat gets re-defined on solaris */ #include diff --git a/librz/magic/softmagic.c 
b/librz/magic/softmagic.c index fa5fd89c0cd..829428a1c99 100644 --- a/librz/magic/softmagic.c +++ b/librz/magic/softmagic.c @@ -34,7 +34,7 @@ #if !USE_LIB_MAGIC #include "file.h" -#include "rz_regex.h" +#include #include #include #include @@ -274,24 +274,17 @@ static int match(RzMagic *ms, struct rz_magic *magic, ut32 nmagic, const ut8 *s, } static int check_fmt(RzMagic *ms, struct rz_magic *m) { - RzRegex rx; - int rc; - if (!strchr(RZ_MAGIC_DESC, '%')) { return 0; } - rc = rz_regex_comp(&rx, "%[-0-9\\.]*s", RZ_REGEX_EXTENDED | RZ_REGEX_NOSUB); - if (rc) { - char errmsg[512]; - rz_regex_error(rc, &rx, errmsg, sizeof(errmsg) - 1); - file_magerror(ms, "regex error %d, (%s)", rc, errmsg); + RzRegex *re = rz_regex_new("%[-0-9\\.]*s", RZ_REGEX_EXTENDED, 0); + if (!re) { return -1; - } else { - rc = rz_regex_exec(&rx, RZ_MAGIC_DESC, 0, 0, 0); - rz_regex_fini(&rx); - return !rc; } + RzRegexStatus rc = rz_regex_match(re, RZ_MAGIC_DESC, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + rz_regex_free(re); + return rc > 0 ? 1 : 0; } char *strdupn(const char *str, size_t n) { @@ -1412,59 +1405,32 @@ static int magiccheck(RzMagic *ms, struct rz_magic *m) { break; } case FILE_REGEX: { - int rc; - RzRegex rx; - char errmsg[512]; - if (!ms->search.s) { return 0; } l = 0; - rc = rz_regex_comp(&rx, m->value.s, - RZ_REGEX_EXTENDED | RZ_REGEX_NEWLINE | - ((m->str_flags & STRING_IGNORE_CASE) ? 
RZ_REGEX_ICASE : 0)); - if (rc) { - (void)rz_regex_error(rc, &rx, errmsg, sizeof(errmsg) - 1); - file_magerror(ms, "regex error %d, (%s)", - rc, errmsg); - v = (ut64)-1; - } else { - RzRegexMatch pmatch[1]; -#ifndef RZ_REGEX_STARTEND -#define RZ_REGEX_STARTEND 0 - size_t l = ms->search.s_len - 1; - char c = ms->search.s[l]; - ((char *)(intptr_t)ms->search.s)[l] = '\0'; -#else - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = ms->search.s_len; -#endif - rc = rz_regex_exec(&rx, (const char *)ms->search.s, 1, pmatch, RZ_REGEX_STARTEND); -#if RZ_REGEX_STARTEND == 0 - ((char *)(intptr_t)ms->search.s)[l] = c; -#endif - switch (rc) { - case 0: - ms->search.s += (int)pmatch[0].rm_so; - ms->search.offset += (size_t)pmatch[0].rm_so; - ms->search.rm_len = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); - v = 0; - break; - case RZ_REGEX_NOMATCH: - v = 1; - break; - default: - (void)rz_regex_error(rc, &rx, errmsg, sizeof(errmsg) - 1); - file_magerror(ms, "regexec error %d, (%s)", rc, errmsg); - v = UT64_MAX; - break; - } - rz_regex_fini(&rx); + RzRegex *rx = rz_regex_new(m->value.s, + RZ_REGEX_EXTENDED | + ((m->str_flags & STRING_IGNORE_CASE) ? 
RZ_REGEX_CASELESS : 0), + 0); + if (!rx) { + return -1; } - if (v == (ut64)-1) { + RzPVector *matches = rz_regex_match_first(rx, (const char *)ms->search.s, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + rz_regex_free(rx); + if (!matches) { return -1; + } else if (rz_pvector_len(matches) == 0) { + v = 1; + break; } + RzRegexMatch *m = rz_pvector_head(matches); + ms->search.s += (int)m->start; + ms->search.offset += (size_t)m->start; + ms->search.rm_len = (size_t)m->len; + rz_pvector_free(matches); + v = 0; break; } default: diff --git a/librz/parse/filter.c b/librz/parse/filter.c index fc774860e6a..040fdd57e88 100644 --- a/librz/parse/filter.c +++ b/librz/parse/filter.c @@ -4,7 +4,7 @@ // SPDX-License-Identifier: LGPL-3.0-only #include "rz_util/rz_str.h" -#include +#include #include #include @@ -179,7 +179,13 @@ static bool is_lea(const char *asm_str) { if (!colored) { return strlen(asm_str) > 4 && rz_str_startswith_icase(asm_str, "lea") && asm_str[3] == ' '; } - return rz_regex_match("(^\x1b\\[[[:digit:]]{1,3}mlea\x1b\\[0m.+)", "ei", asm_str) != RZ_REGEX_NOMATCH; + RzRegex *re = rz_regex_new("(^\x1b\\[\\d{1,3}mlea\x1b\\[0m.+)", RZ_REGEX_EXTENDED | RZ_REGEX_CASELESS, 0); + if (!re) { + return false; + } + bool res = rz_regex_match(re, asm_str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT) != RZ_REGEX_ERROR_NOMATCH; + rz_regex_free(re); + return res; } static bool filter(RzParse *p, ut64 addr, RzFlag *f, RzAnalysisHint *hint, char *data, char *str, int len, bool big_endian) { diff --git a/librz/parse/p/parse_arm_pseudo.c b/librz/parse/p/parse_arm_pseudo.c index 30ca4b2b8b0..9c54178cc75 100644 --- a/librz/parse/p/parse_arm_pseudo.c +++ b/librz/parse/p/parse_arm_pseudo.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "parse_common.c" @@ -266,44 +268,34 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu return tstr; } - RzRegex var_re; - if (rz_regex_comp(&var_re, re_str, RZ_REGEX_EXTENDED | 
RZ_REGEX_ICASE) != 0) { - rz_regex_fini(&var_re); + RzRegex *var_re = rz_regex_new(re_str, RZ_REGEX_EXTENDED | RZ_REGEX_CASELESS, 0); + if (!var_re) { return tstr; } - RzRegexMatch match[4] = { 0 }; - if (rz_regex_exec(&var_re, tstr, RZ_ARRAY_SIZE(match), match, 0) != 0) { - rz_regex_fini(&var_re); + RzPVector *matches = rz_regex_match_first(var_re, tstr, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + if (!matches || rz_pvector_empty(matches)) { + rz_regex_free(var_re); + rz_pvector_free(matches); return tstr; } - for (size_t i = 0; i < RZ_ARRAY_SIZE(match); i++) { - char *s = rz_regex_match_extract(tstr, &match[i]); - free(s); - } - rz_regex_fini(&var_re); + rz_regex_free(var_re); - rz_return_val_if_fail(match[1].rm_so >= 0, tstr); - char *reg_str = rz_regex_match_extract(tstr, &match[1]); + RzRegexMatch *match = rz_pvector_at(matches, 1); + char *reg_str = rz_str_ndup(tstr + match->start, match->len); if (!reg_str) { + rz_pvector_free(matches); return tstr; } - if (!rz_str_casecmp(reg_str, "x29")) { - free(reg_str); - reg_str = strdup("fp"); - } - rz_return_val_if_fail(match[group_idx_addend].rm_so >= 0, tstr); - char *addend_str = rz_regex_match_extract(tstr, &match[group_idx_addend]); - if (!addend_str) { - free(reg_str); - return tstr; - } + rz_return_val_if_fail(rz_pvector_len(matches) >= group_idx_addend, tstr); + match = rz_pvector_at(matches, group_idx_addend); + const char *addend_str = tstr + match->start; st64 reg_addend = strtoll(addend_str, NULL, 0); - free(addend_str); if (group_idx_sign >= 0) { - rz_return_val_if_fail(match[group_idx_sign].rm_so >= 0, tstr); - char sign = tstr[match[group_idx_sign].rm_so]; + rz_return_val_if_fail(rz_pvector_len(matches) >= group_idx_sign, tstr); + match = rz_pvector_at(matches, group_idx_sign); + char sign = tstr[match->start]; if (sign == '-') { reg_addend = -reg_addend; } @@ -312,16 +304,18 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu char *varstr = 
p->var_expr_for_reg_access(f, addr, reg_str, reg_addend); if (!varstr) { free(reg_str); + rz_pvector_free(matches); return tstr; } // replace! - size_t tail_len = strlen(tstr) - match[0].rm_eo; + RzRegexMatch *match_full = rz_pvector_at(matches, 0); + size_t tail_len = strlen(tstr) - (match_full->start + match_full->len); RzStrBuf sb; rz_strbuf_init(&sb); // reserve with a bit of padding for brackets, reg, whitespace, ... - rz_strbuf_reserve(&sb, match[0].rm_so + strlen(varstr) + tail_len + 32); - rz_strbuf_append_n(&sb, tstr, match[0].rm_so); + rz_strbuf_reserve(&sb, match_full->start + strlen(varstr) + tail_len + 32); + rz_strbuf_append_n(&sb, tstr, match_full->start); if (brackets) { rz_strbuf_append(&sb, "["); } @@ -332,10 +326,11 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu if (brackets) { rz_strbuf_append(&sb, "]"); } - rz_strbuf_append_n(&sb, tstr + match[0].rm_eo, tail_len); + rz_strbuf_append_n(&sb, tstr + match_full->start + match_full->len, tail_len); free(reg_str); free(varstr); free(tstr); + rz_pvector_free(matches); return rz_strbuf_drain_nofree(&sb); } diff --git a/librz/parse/p/parse_mips_pseudo.c b/librz/parse/p/parse_mips_pseudo.c index 5e188de7bc9..633bd0e16bd 100644 --- a/librz/parse/p/parse_mips_pseudo.c +++ b/librz/parse/p/parse_mips_pseudo.c @@ -13,6 +13,7 @@ #include #include "parse_common.c" +#include static RzList /**/ *mips_tokenize(const char *assembly, size_t length); @@ -155,7 +156,7 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu if (!p->pseudo) { // match e.g. 
-0x18(fp) // capturing "-0x18", "0x", "fp" - re_str = "(-?(0x)?[0-9a-f]+)\\(([a-z][0-9a-z]))"; + re_str = "(-?(0x)?[0-9a-f]+)\\(([a-z][0-9a-z])\\)"; group_idx_reg = 3; group_idx_sign = -1; group_idx_addend = 1; @@ -168,39 +169,36 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu group_idx_addend = 3; } - RzRegex var_re; - if (rz_regex_comp(&var_re, re_str, RZ_REGEX_EXTENDED | RZ_REGEX_ICASE) != 0) { - rz_regex_fini(&var_re); + RzRegex *var_re = rz_regex_new(re_str, RZ_REGEX_EXTENDED | RZ_REGEX_CASELESS, 0); + if (!var_re) { return tstr; } - RzRegexMatch match[4]; - if (rz_regex_exec(&var_re, tstr, RZ_ARRAY_SIZE(match), match, 0) != 0) { - rz_regex_fini(&var_re); + RzPVector *matches = rz_regex_match_first(var_re, tstr, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + if (!matches || rz_pvector_empty(matches)) { + rz_regex_free(var_re); + rz_pvector_free(matches); return tstr; } - for (size_t i = 0; i < RZ_ARRAY_SIZE(match); i++) { - char *s = rz_regex_match_extract(tstr, &match[i]); - free(s); - } - rz_regex_fini(&var_re); + rz_regex_free(var_re); - rz_return_val_if_fail(match[group_idx_reg].rm_so >= 0, tstr); - char *reg_str = rz_regex_match_extract(tstr, &match[group_idx_reg]); + rz_return_val_if_fail(rz_pvector_len(matches) > group_idx_reg, tstr); + RzRegexMatch *match = rz_pvector_at(matches, group_idx_reg); + char *reg_str = rz_str_ndup(tstr + match->start, match->len); if (!reg_str) { + rz_pvector_free(matches); return tstr; } - char *addend_str = rz_regex_match_extract(tstr, &match[group_idx_addend]); - if (!addend_str) { - free(reg_str); - return tstr; - } + rz_return_val_if_fail(rz_pvector_len(matches) >= group_idx_addend, tstr); + match = rz_pvector_at(matches, group_idx_addend); + const char *addend_str = tstr + match->start; st64 reg_addend = strtoll(addend_str, NULL, 0); - free(addend_str); if (group_idx_sign >= 0) { - rz_return_val_if_fail(match[group_idx_sign].rm_so >= 0, tstr); - if 
(tstr[match[group_idx_sign].rm_so] == '-') { + rz_return_val_if_fail(rz_pvector_len(matches) >= group_idx_sign, tstr); + match = rz_pvector_at(matches, group_idx_sign); + char sign = tstr[match->start]; + if (sign == '-') { reg_addend = -reg_addend; } } @@ -208,15 +206,18 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu char *varstr = p->var_expr_for_reg_access(f, addr, reg_str, reg_addend); if (!varstr) { free(reg_str); + rz_pvector_free(matches); return tstr; } // information gathered, now perform the replacement in the string - size_t tail_len = strlen(tstr) - match[0].rm_eo; + RzRegexMatch *match_full = rz_pvector_at(matches, 0); + size_t tail_len = strlen(tstr) - (match_full->start + match_full->len); RzStrBuf sb; rz_strbuf_init(&sb); - rz_strbuf_reserve(&sb, match[0].rm_so + strlen(varstr) + tail_len + 32); - rz_strbuf_append_n(&sb, tstr, match[0].rm_so); + // reserve with a bit of padding for brackets, reg, whitespace, ... + rz_strbuf_reserve(&sb, match_full->start + strlen(varstr) + tail_len + 32); + rz_strbuf_append_n(&sb, tstr, match_full->start); if (p->localvar_only) { if (p->pseudo) { rz_strbuf_append(&sb, varstr); @@ -230,10 +231,11 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu rz_strbuf_appendf(&sb, "%s(%s)", varstr, reg_str); } } - rz_strbuf_append_n(&sb, tstr + match[0].rm_eo, tail_len); + rz_strbuf_append_n(&sb, tstr + match_full->start + match_full->len, tail_len); free(reg_str); free(varstr); free(tstr); + rz_pvector_free(matches); return rz_strbuf_drain_nofree(&sb); } diff --git a/librz/parse/p/parse_x86_pseudo.c b/librz/parse/p/parse_x86_pseudo.c index 2ed40721e2f..e4d316931bf 100644 --- a/librz/parse/p/parse_x86_pseudo.c +++ b/librz/parse/p/parse_x86_pseudo.c @@ -291,7 +291,7 @@ static bool parse(RzParse *p, const char *data, RzStrBuf *sb) { return true; } -static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFunction *f, char *tstr, bool att) { 
+static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFunction *f, RZ_OWN char *tstr, bool att) { const ut64 addr = op->addr; if (!p->var_expr_for_reg_access || !f) { @@ -305,7 +305,7 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu if (att) { // match e.g. -0x18(%rbp) // capturing "-0x18", "0x", "rbp" - re_str = "(-?(0x)?[0-9a-f]+)\\(%([re][0-9a-z][0-9a-z]))"; + re_str = "(-?(0x)?[0-9a-f]+)\\(%([re][0-9a-z][0-9a-z])\\)"; group_idx_reg = 3; group_idx_sign = -1; group_idx_addend = 1; @@ -318,46 +318,42 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu group_idx_addend = 3; } - RzRegex var_re; - if (rz_regex_comp(&var_re, re_str, RZ_REGEX_EXTENDED | RZ_REGEX_ICASE) != 0) { - rz_regex_fini(&var_re); + RzRegex *var_re = rz_regex_new(re_str, RZ_REGEX_EXTENDED | RZ_REGEX_CASELESS, 0); + if (!var_re) { return tstr; } - RzRegexMatch match[4]; - if (rz_regex_exec(&var_re, tstr, RZ_ARRAY_SIZE(match), match, 0) != 0) { - rz_regex_fini(&var_re); + RzPVector *matches = rz_regex_match_first(var_re, tstr, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + if (!matches || rz_pvector_empty(matches)) { + rz_regex_free(var_re); + rz_pvector_free(matches); return tstr; } - for (size_t i = 0; i < RZ_ARRAY_SIZE(match); i++) { - char *s = rz_regex_match_extract(tstr, &match[i]); - free(s); - } - rz_regex_fini(&var_re); + rz_regex_free(var_re); - rz_return_val_if_fail(match[group_idx_reg].rm_so >= 0, tstr); - char *reg_str = rz_regex_match_extract(tstr, &match[group_idx_reg]); + rz_return_val_if_fail(rz_pvector_len(matches) > group_idx_reg, tstr); + RzRegexMatch *match = rz_pvector_at(matches, group_idx_reg); + char *reg_str = rz_str_ndup(tstr + match->start, match->len); if (!reg_str) { + rz_pvector_free(matches); return tstr; } - char *addend_str = rz_regex_match_extract(tstr, &match[group_idx_addend]); - if (!addend_str) { - free(reg_str); - return tstr; - } + 
rz_return_val_if_fail(rz_pvector_len(matches) >= group_idx_addend, tstr); + match = rz_pvector_at(matches, group_idx_addend); + const char *addend_str = tstr + match->start; int base = 0; - size_t addend_len = strlen(addend_str); + size_t addend_len = match->len; if (addend_len && (addend_str[addend_len - 1] == 'h' || addend_str[addend_len - 1] == 'H')) { // MASM syntax prints hex numbers like `1234h` - addend_str[addend_len - 1] = '\0'; base = 16; } st64 reg_addend = strtoll(addend_str, NULL, base); - free(addend_str); if (group_idx_sign >= 0) { - rz_return_val_if_fail(match[group_idx_sign].rm_so >= 0, tstr); - if (tstr[match[group_idx_sign].rm_so] == '-') { + rz_return_val_if_fail(rz_pvector_len(matches) >= group_idx_sign, tstr); + match = rz_pvector_at(matches, group_idx_sign); + char sign = tstr[match->start]; + if (sign == '-') { reg_addend = -reg_addend; } } @@ -365,15 +361,18 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu char *varstr = p->var_expr_for_reg_access(f, addr, reg_str, reg_addend); if (!varstr) { free(reg_str); + rz_pvector_free(matches); return tstr; } // replace! - size_t tail_len = strlen(tstr) - match[0].rm_eo; + RzRegexMatch *match_full = rz_pvector_at(matches, 0); + size_t tail_len = strlen(tstr) - (match_full->start + match_full->len); RzStrBuf sb; rz_strbuf_init(&sb); - rz_strbuf_reserve(&sb, match[0].rm_so + strlen(varstr) + tail_len + 32); - rz_strbuf_append_n(&sb, tstr, match[0].rm_so); + // reserve with a bit of padding for brackets, reg, whitespace, ... + rz_strbuf_reserve(&sb, match_full->start + strlen(varstr) + tail_len + 32); + rz_strbuf_append_n(&sb, tstr, match_full->start); if (!p->localvar_only && !att) { rz_strbuf_appendf(&sb, "%s %c ", reg_str, reg_addend < 0 ? 
'-' : '+'); } @@ -381,10 +380,11 @@ static char *subvar_stack(RzParse *p, RzAnalysisOp *op, RZ_NULLABLE RzAnalysisFu if (!p->localvar_only && att) { rz_strbuf_appendf(&sb, "(%%%s)", reg_str); } - rz_strbuf_append_n(&sb, tstr + match[0].rm_eo, tail_len); + rz_strbuf_append_n(&sb, tstr + match_full->start + match_full->len, tail_len); free(reg_str); free(varstr); free(tstr); + rz_pvector_free(matches); return rz_strbuf_drain_nofree(&sb); } diff --git a/librz/reg/profile.c b/librz/reg/profile.c index 284d4452679..820b2835cbe 100644 --- a/librz/reg/profile.c +++ b/librz/reg/profile.c @@ -288,7 +288,7 @@ static bool parse_reg_profile_str(RZ_OUT RzList /**/ *alias continue; } if (rz_str_strchr(line, "#")) { - RzList *line_and_cmt = rz_str_split_duplist_n_regex(line, "#", 0, true); + RzList *line_and_cmt = rz_str_split_duplist_n_regex(line, "\\#", 0, true); char *raw_comment = strdup(rz_list_get_top(line_and_cmt)); if (!raw_comment) { RZ_LOG_WARN("Comment could not be split from register definition. Line: \"%s\"\n", line); @@ -299,11 +299,11 @@ static bool parse_reg_profile_str(RZ_OUT RzList /**/ *alias RZ_LOG_WARN("Could not prepend # to comment. Line: \"%s\".\n", line); continue; } - toks = rz_str_split_duplist_n_regex(rz_list_get_bottom(line_and_cmt), "[[:blank:]]+", 0, true); + toks = rz_str_split_duplist_n_regex(rz_list_get_bottom(line_and_cmt), "\\s+", 0, true); rz_list_append(toks, comment); rz_list_free(line_and_cmt); } else { - toks = rz_str_split_duplist_n_regex(line, "[[:blank:]]+", 0, true); + toks = rz_str_split_duplist_n_regex(line, "\\s+", 0, true); } ut32 toks_len = rz_list_length(toks); if (rz_list_empty(toks)) { diff --git a/librz/search/regexp.c b/librz/search/regexp.c index 3139e314664..d35a5ae3c7b 100644 --- a/librz/search/regexp.c +++ b/librz/search/regexp.c @@ -3,48 +3,54 @@ // SPDX-License-Identifier: LGPL-3.0-only #include "rz_search.h" -#include +#include +#include +/** + * \return -1 on failure. 
+ */
 RZ_API int rz_search_regexp_update(RzSearch *s, ut64 from, const ut8 *buf, int len) {
 	RzSearchKeyword *kw;
 	RzListIter *iter;
-	RzRegexMatch match;
-	RzRegex compiled = { 0 };
+	RzPVector *matches = NULL;
+	RzRegex *compiled = NULL;
 	const int old_nhits = s->nhits;
 	int ret = 0;

 	rz_list_foreach (s->kws, iter, kw) {
-		int reflags = RZ_REGEX_EXTENDED;
+		int cflags = RZ_REGEX_EXTENDED;

 		if (kw->icase) {
-			reflags |= RZ_REGEX_ICASE;
+			cflags |= RZ_REGEX_CASELESS;
 		}

-		if (rz_regex_comp(&compiled, (char *)kw->bin_keyword, reflags)) {
+		rz_regex_free(compiled); // Release the regex of the previous keyword (no-op for NULL).
+		compiled = rz_regex_new((char *)kw->bin_keyword, cflags, 0);
+		if (!compiled) {
 			eprintf("Cannot compile '%s' regexp\n", kw->bin_keyword);
 			return -1;
 		}

-		match.rm_so = 0;
-		match.rm_eo = len;
-
-		while (!rz_regex_exec(&compiled, (char *)buf, 1, &match, RZ_REGEX_STARTEND)) {
-			int t = rz_search_hit_new(s, kw, from + match.rm_so);
-			if (!t) {
+		matches = rz_regex_match_all_not_grouped(compiled, (char *)buf, len, 0, RZ_REGEX_DEFAULT); // Offsets are relative to buf, not to the address 'from'.
+		void **it;
+		rz_pvector_foreach (matches, it) {
+			RzRegexMatch *m = *it;
+			int t = rz_search_hit_new(s, kw, from + m->start); // 'from' is the address of buf[0].
+			if (t == 0) {
 				ret = -1;
+				rz_pvector_free(matches);
 				goto beach;
 			}
 			if (t > 1) {
+				rz_pvector_free(matches); // Max hits reached
 				goto beach;
 			}
-			/* Setup the boundaries for RZ_REGEX_STARTEND */
-			match.rm_so = match.rm_eo;
-			match.rm_eo = len;
 		}
+		rz_pvector_free(matches);
 	}

 beach:
-	rz_regex_fini(&compiled);
+	rz_regex_free(compiled);
 	if (!ret) {
 		ret = s->nhits - old_nhits;
 	}
diff --git a/librz/util/list.c b/librz/util/list.c
index de4aa2ddc05..405a3575614 100644
--- a/librz/util/list.c
+++ b/librz/util/list.c
@@ -827,6 +827,10 @@ RZ_API RZ_OWN RzList *rz_list_uniq(RZ_NONNULL const RzList *list, RZ_NONNULL RzL
 /**
  * \brief Casts a RzList containg strings into a concatenated string
  *
+ * \param list The list of strings to concatenate.
+ * \param ch char to separate the match strings.
+ *
+ * \return The concatenated string.
**/ RZ_API RZ_OWN char *rz_list_to_str(RZ_NONNULL RzList *list, char ch) { RzListIter *iter; diff --git a/librz/util/meson.build b/librz/util/meson.build index 8de72c48c32..eccbfa7f1fa 100644 --- a/librz/util/meson.build +++ b/librz/util/meson.build @@ -43,9 +43,7 @@ rz_util_common_sources = [ 'punycode.c', 'range.c', 'rbtree.c', - 'regex/regcomp.c', - 'regex/regerror.c', - 'regex/regexec.c', + 'regex.c', 'serialize_spaces.c', 'signal.c', 'skiplist.c', @@ -91,7 +89,7 @@ rz_util_common_sources = [ ] rz_util_sources = rz_util_common_sources -rz_util_deps = [ldl, lrt, mth, th, utl] + platform_deps +rz_util_deps = [ldl, lrt, mth, th, utl, pcre2_dep] + platform_deps if zlib_dep.found() rz_util_deps += [zlib_dep] endif @@ -149,7 +147,7 @@ if meson.is_cross_build() cc_native.find_library('psapi'), ] endif - rz_util_native_deps = [ldl_native, lrt_native, mth_native, th_native, utl_native] + platform_native_deps + rz_util_native_deps = [ldl_native, lrt_native, mth_native, th_native, utl_native, pcre2_dep] + platform_native_deps if execinfo_native.found() rz_util_native_deps += [execinfo_native] endif diff --git a/librz/util/print.c b/librz/util/print.c index fe3441ce732..69d426f69d4 100644 --- a/librz/util/print.c +++ b/librz/util/print.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/librz/util/regex.c b/librz/util/regex.c new file mode 100644 index 00000000000..bf85fded2e0 --- /dev/null +++ b/librz/util/regex.c @@ -0,0 +1,411 @@ +// SPDX-FileCopyrightText: 2023 Rot127 +// SPDX-License-Identifier: LGPL-3.0-only + +#define PCRE2_STATIC +#define PCRE2_CODE_UNIT_WIDTH 8 +#include + +#include +#include +#include +#include +#include +#include + +typedef pcre2_general_context RzRegexGeneralContext; ///< General context. +typedef pcre2_compile_context RzRegexCompContext; ///< The context for compiling. +typedef pcre2_match_context RzRegexMatchContext; ///< The context for matching. 
+
+typedef struct {
+	RzRegexGeneralContext *general;
+	RzRegexCompContext *compile;
+	RzRegexMatchContext *match;
+} RzRegexContexts;
+
+/**
+ * \brief Logs the PCRE2 error message which belongs to \p err_num.
+ *
+ * \param pattern The pattern the error belongs to. Can be NULL.
+ * \param err_num The PCRE2 error code.
+ * \param err_off The offset into the pattern where the error was detected.
+ */
+static void print_pcre2_err(RZ_NULLABLE const char *pattern, RzRegexStatus err_num, size_t err_off) {
+	PCRE2_UCHAR buffer[256];
+	pcre2_get_error_message(err_num, buffer, sizeof(buffer));
+	RZ_LOG_ERROR("Regex compilation for '%s' failed at %" PFMTSZu ": %s\n", pattern ? pattern : "(null)", err_off,
+		buffer);
+}
+
+/**
+ * \brief Returns a copy of \p pattern where every unescaped ASCII space is replaced with "\s".
+ *
+ * A backslash and the byte following it are copied verbatim. So an escaped space ("\ ")
+ * stays an escaped space, instead of becoming an escaped backslash followed by 's'.
+ *
+ * \param pattern The zero terminated pattern to copy.
+ *
+ * \return The newly allocated pattern or NULL if the allocation failed.
+ */
+static char *replace_unescaped_spaces(RZ_NONNULL const char *pattern) {
+	// Every input byte produces at most two output bytes.
+	char *out = calloc(strlen(pattern) + 1, 2);
+	if (!out) {
+		return NULL;
+	}
+	char *w = out;
+	for (const char *r = pattern; *r; r++) {
+		if (*r == '\\' && r[1]) {
+			*w++ = *r++;
+			*w++ = *r;
+		} else if (*r == ' ') {
+			*w++ = '\\';
+			*w++ = 's';
+		} else {
+			*w++ = *r;
+		}
+	}
+	*w = '\0';
+	return out;
+}
+
+/**
+ * \brief Compile a regex pattern to a RzRegex and return it.
+ * In case of an error, an error message is printed and NULL is returned.
+ *
+ * PCRE2_UTF and PCRE2_MATCH_INVALID_UTF are always added to \p cflags.
+ * If RZ_REGEX_EXTENDED or RZ_REGEX_EXTENDED_MORE is set, unescaped ASCII spaces in
+ * \p pattern are replaced with "\s" before compiling. PCRE2 would skip them otherwise.
+ *
+ * \param pattern The regex pattern string.
+ * \param cflags The compilation flags or zero for default.
+ * \param jflags The compilation flags for the JIT compiler.
+ * You can pass RZ_REGEX_JIT_PARTIAL_SOFT or RZ_REGEX_JIT_PARTIAL_HARD if you
+ * intend to use the pattern for partial matching. Otherwise set it to 0.
+ *
+ * \return The compiled regex or NULL in case of failure.
+ */
+RZ_API RZ_OWN RzRegex *rz_regex_new(RZ_NONNULL const char *pattern, RzRegexFlags cflags, RzRegexFlags jflags) {
+	rz_return_val_if_fail(pattern, NULL);
+
+	RzRegexStatus err_num;
+	RzRegexSize err_off;
+	ut32 supported = 0;
+	pcre2_config(PCRE2_CONFIG_UNICODE, &supported);
+	if (supported != 1) {
+		RZ_LOG_ERROR("Unicode not supported by PCRE2 library.\n");
+		return NULL;
+	}
+	char *fixed_pat = NULL;
+	const char *pat = pattern;
+	const bool extended = (cflags & RZ_REGEX_EXTENDED) || (cflags & RZ_REGEX_EXTENDED_MORE);
+	if (extended && strchr(pattern, ' ')) {
+		// In PCRE2 with the extended flag set, ascii space characters ' ' are skipped.
+		// We need to replace them with \s unfortunately to keep our API stable.
+		fixed_pat = replace_unescaped_spaces(pattern);
+		if (!fixed_pat) {
+			RZ_LOG_ERROR("Failed to allocate memory for the regex pattern.\n");
+			return NULL;
+		}
+		pat = fixed_pat;
+	}
+
+	RzRegex *regex = pcre2_compile(
+		(PCRE2_SPTR)pat,
+		PCRE2_ZERO_TERMINATED,
+		cflags | PCRE2_UTF | PCRE2_MATCH_INVALID_UTF,
+		&err_num,
+		&err_off,
+		NULL);
+	if (!regex) {
+		print_pcre2_err(pat, err_num, err_off);
+		free(fixed_pat);
+		return NULL;
+	}
+#ifdef SUPPORTS_PCRE2_JIT
+	// A failing JIT compilation is only reported. The regex stays usable.
+	RzRegexStatus jit_err = pcre2_jit_compile(regex, jflags | PCRE2_JIT_COMPLETE);
+	if (jit_err < 0) {
+		print_pcre2_err(pat, jit_err, 0);
+	}
+#endif
+	free(fixed_pat);
+	return regex;
+}
+
+/**
+ * \brief Frees a given RzRegex.
+ *
+ * \param regex The RzRegex to free.
+ */
+RZ_API void rz_regex_free(RZ_OWN RzRegex *regex) {
+	pcre2_code_free(regex);
+}
+
+/**
+ * \brief Frees PCRE2 match data.
+ *
+ * \param match_data The match data to free.
+ */
+static void rz_regex_match_data_free(RZ_OWN RzRegexMatchData *match_data) {
+	pcre2_match_data_free(match_data);
+}
+
+/**
+ * \brief Matches the \p regex in the \p text and returns a status code with the result.
+ *
+ * \param regex The regex pattern to match.
+ * \param text The text to search in.
+ * \param text_size The length of the buffer pointed to by \p text.
+ * Can be set to RZ_REGEX_ZERO_TERMINATED if the buffer is a zero terminated string.
+ * \param text_offset The offset into \p text from where the search starts.
+ * \param mflags Match flags.
+ *
+ * \return A status code which describes the result.
+ * It is the value returned by pcre2_match(): negative if nothing matched or an error occurred.
+ */
+RZ_API RzRegexStatus rz_regex_match(RZ_NONNULL const RzRegex *regex, RZ_NONNULL const char *text,
+	RzRegexSize text_size,
+	RzRegexSize text_offset,
+	RzRegexFlags mflags) {
+	rz_return_val_if_fail(regex && text, RZ_REGEX_ERROR_NOMATCH);
+
+	pcre2_match_data *mdata = pcre2_match_data_create_from_pattern(regex, NULL);
+	RzRegexStatus rc = pcre2_match(regex, (PCRE2_SPTR)text, text_size, text_offset, mflags | PCRE2_NO_UTF_CHECK, mdata, NULL);
+	pcre2_match_data_free(mdata);
+	return rc;
+}
+
+/**
+ * \brief Generates the error message to \p errcode.
+ *
+ * \param errcode The error code.
+ * \param errbuf The error message buffer.
+ * \param errbuf_size The error message buffer size in bytes.
+ */
+RZ_API void rz_regex_error_msg(RzRegexStatus errcode, RZ_OUT char *errbuf, RzRegexSize errbuf_size) {
+	if (!errbuf || errbuf_size == 0) {
+		// Nowhere to write the message to.
+		return;
+	}
+	pcre2_get_error_message(errcode, (PCRE2_UCHAR *)errbuf, errbuf_size);
+}
+
+/**
+ * \brief Returns the name of a group.
+ *
+ * \param regex The regex expression with named groups.
+ * \param group_idx The index of the group to get the name for.
+ *
+ * \return The name of the group or NULL in case of failure or if none was set.
+ */
+RZ_API const ut8 *rz_regex_get_match_name(RZ_NONNULL const RzRegex *regex, ut32 group_idx) {
+	rz_return_val_if_fail(regex, NULL);
+
+	ut32 namecount = 0;
+	ut32 name_entry_size = 0;
+	PCRE2_SPTR nametable_ptr = NULL;
+
+	// pcre2_pattern_info() returns 0 on success. Do not touch the outputs otherwise.
+	if (pcre2_pattern_info(regex, PCRE2_INFO_NAMECOUNT, &namecount) != 0 || namecount == 0) {
+		return NULL;
+	}
+	if (pcre2_pattern_info(regex, PCRE2_INFO_NAMETABLE, &nametable_ptr) != 0 || !nametable_ptr) {
+		return NULL;
+	}
+	if (pcre2_pattern_info(regex, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size) != 0) {
+		return NULL;
+	}
+
+	for (ut32 i = 0; i < namecount; i++) {
+		// The first two bytes of an entry are the group number, most significant byte first.
+		// The zero terminated name follows.
+		ut32 n = ((ut32)nametable_ptr[0] << 8) | nametable_ptr[1];
+		if (n == group_idx) {
+			return nametable_ptr + 2;
+		}
+		nametable_ptr += name_entry_size;
+	}
+	return NULL;
+}
+
+/**
+ * \brief Finds the first match in a text and returns it as a pvector.
+ * First element in the vector is always the whole match, the following possible groups.
+ *
+ * \param regex The regex pattern to match.
+ * \param text The text to search in.
+ * \param text_size The length of the buffer pointed to by \p text.
+ * Can be set to RZ_REGEX_ZERO_TERMINATED if the buffer is a zero terminated string.
+ * \param text_offset The offset into \p text from where the search starts.
+ * \param mflags Match flags.
+ *
+ * \return The matches as pvector. NULL in case of failure. Empty for no matches or regex related errors.
+ */
+RZ_API RZ_OWN RzPVector /*<RzRegexMatch *>*/ *rz_regex_match_first(
+	RZ_NONNULL const RzRegex *regex,
+	RZ_NONNULL const char *text,
+	RzRegexSize text_size,
+	RzRegexSize text_offset,
+	RzRegexFlags mflags) {
+	rz_return_val_if_fail(regex && text, NULL);
+
+	// The vector owns its elements, so rz_pvector_free() releases the RzRegexMatch objects as well.
+	RzPVector *matches = rz_pvector_new(free);
+	if (!matches) {
+		return NULL;
+	}
+	RzRegexMatchData *mdata = pcre2_match_data_create_from_pattern(regex, NULL);
+	if (!mdata) {
+		rz_pvector_free(matches);
+		return NULL;
+	}
+	RzRegexStatus rc = pcre2_match(regex, (PCRE2_SPTR)text, text_size, text_offset, mflags | PCRE2_NO_UTF_CHECK, mdata, NULL);
+
+	if (rc == PCRE2_ERROR_NOMATCH) {
+		// Nothing matched return empty vector.
+		goto fini;
+	}
+
+	if (rc < 0) {
+		// Some error happened. Inform the user.
+		PCRE2_UCHAR buffer[256];
+		pcre2_get_error_message(rc, buffer, sizeof(buffer));
+		RZ_LOG_WARN("Regex matching failed: %s\n", buffer);
+		goto fini;
+	}
+
+	// Add groups to vector.
+	// rc is not negative here. It is the number of offset pairs PCRE2 has set:
+	// the whole match plus all groups up to the highest numbered one which matched.
+	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(mdata);
+	const size_t pair_count = (size_t)rc;
+
+	for (size_t i = 0; i < pair_count; i++) {
+		if (ovector[2 * i] > ovector[2 * i + 1]) {
+			// This happens for \K lookaround. We fail if used.
+			// See pcre2demo.c for details.
+			RZ_LOG_ERROR("Usage of \\K to set start of the pattern later than the end, is not implemented.\n");
+			goto fini;
+		}
+
+		RzRegexMatch *match = RZ_NEW0(RzRegexMatch);
+		if (!match) {
+			// Out of memory. Do not hand out an incomplete result.
+			rz_pvector_free(matches);
+			matches = NULL;
+			goto fini;
+		}
+		// Offset and length of match.
+		// A group which did not take part in the match has both offsets set to PCRE2_UNSET.
+		// Its start is RZ_REGEX_UNSET and its length 0 then.
+		match->start = ovector[2 * i];
+		match->len = ovector[2 * i + 1] - match->start;
+		match->group_idx = i;
+		rz_pvector_push(matches, match);
+	}
+
+fini:
+	rz_regex_match_data_free(mdata);
+	return matches;
+}
+
+/**
+ * \brief Finds all matches in a text and returns them as vector.
+ * The result is a flat vector of matches. A single match with multiple
+ * groups is simply appended to the resulting vector.
+ *
+ * \param regex The regex pattern to match.
+ * \param text The text to search in.
+ * \param text_size The length of the buffer pointed to by \p text.
+ * Can be set to RZ_REGEX_ZERO_TERMINATED if the buffer is a zero terminated string.
+ * \param text_offset The offset into \p text from where the search starts.
+ * \param mflags Match flags.
+ *
+ * \return A vector of all matches or NULL in case of failure.
+ * Sub-groups of a match are appended after their main match.
+ * The returned vector owns the matches. They are freed together with it.
+ */
+RZ_API RZ_OWN RzPVector /*<RzRegexMatch *>*/ *rz_regex_match_all_not_grouped(
+	RZ_NONNULL const RzRegex *regex,
+	RZ_NONNULL const char *text,
+	RzRegexSize text_size,
+	RzRegexSize text_offset,
+	RzRegexFlags mflags) {
+	rz_return_val_if_fail(regex && text, NULL);
+
+	RzPVector *all_matches = rz_pvector_new(free);
+	if (!all_matches) {
+		return NULL;
+	}
+	// Real end of the text. Needed to stop stepping over empty matches.
+	const RzRegexSize text_end = text_size == RZ_REGEX_ZERO_TERMINATED ? strlen(text) : text_size;
+
+	RzPVector *matches = rz_regex_match_first(regex, text, text_size, text_offset, mflags);
+	while (matches && rz_pvector_len(matches) > 0) {
+		RzRegexMatch *whole_match = rz_pvector_head(matches);
+		RzRegexSize next_offset = whole_match->start + whole_match->len;
+		if (whole_match->len == 0) {
+			// An empty match does not consume anything. Searching again at the same
+			// offset would return it forever. Step over one (UTF-8) character instead.
+			next_offset++;
+			while (next_offset < text_end && (((const ut8 *)text)[next_offset] & 0xc0) == 0x80) {
+				next_offset++;
+			}
+		}
+
+		// Move the whole match and its groups into the result.
+		size_t mlen = rz_pvector_len(matches);
+		for (size_t i = 0; i < mlen; ++i) {
+			RzRegexMatch *m = rz_pvector_pop_front(matches);
+			rz_pvector_push(all_matches, m);
+		}
+		rz_pvector_free(matches);
+		matches = NULL;
+		if (next_offset > text_end) {
+			// The empty match was at the very end of the text.
+			break;
+		}
+		// Search again after the whole first match.
+		matches = rz_regex_match_first(regex, text, text_size, next_offset, mflags);
+	}
+
+	// Free last vector without matches.
+	rz_pvector_free(matches);
+	return all_matches;
+}
+
+/**
+ * \brief Finds all matches in a text and returns them as vector of vector matches.
+ *
+ * \param regex The regex pattern to match.
+ * \param text The text to search in.
+ * \param text_size The length of the buffer pointed to by \p text.
+ * Can be set to RZ_REGEX_ZERO_TERMINATED if the buffer is a zero terminated string.
+ * \param text_offset The offset into \p text from where the search starts.
+ * \param mflags Match flags.
+ *
+ * \return PVector of every match in the given string or NULL in case of failure.
+ * One match with all its groups is again assembled in a pvector.
+ */
+RZ_API RZ_OWN RzPVector /*<RzPVector<RzRegexMatch *> *>*/ *rz_regex_match_all(
+	RZ_NONNULL const RzRegex *regex,
+	RZ_NONNULL const char *text,
+	RzRegexSize text_size,
+	RzRegexSize text_offset,
+	RzRegexFlags mflags) {
+	rz_return_val_if_fail(regex && text, NULL);
+
+	RzPVector *all_matches = rz_pvector_new((RzPVectorFree)rz_pvector_free);
+	if (!all_matches) {
+		return NULL;
+	}
+	// Real end of the text. Needed to stop stepping over empty matches.
+	const RzRegexSize text_end = text_size == RZ_REGEX_ZERO_TERMINATED ? strlen(text) : text_size;
+
+	RzPVector *matches = rz_regex_match_first(regex, text, text_size, text_offset, mflags);
+	while (matches && rz_pvector_len(matches) > 0) {
+		rz_pvector_push(all_matches, matches);
+		RzRegexMatch *m = rz_pvector_head(matches);
+		RzRegexSize next_offset = m->start + m->len;
+		if (m->len == 0) {
+			// An empty match does not consume anything. Searching again at the same
+			// offset would return it forever. Step over one (UTF-8) character instead.
+			next_offset++;
+			while (next_offset < text_end && (((const ut8 *)text)[next_offset] & 0xc0) == 0x80) {
+				next_offset++;
+			}
+		}
+		// The vector is owned by all_matches now.
+		matches = NULL;
+		if (next_offset > text_end) {
+			// The empty match was at the very end of the text.
+			break;
+		}
+		// Search again after the last match.
+		matches = rz_regex_match_first(regex, text, text_size, next_offset, mflags);
+	}
+
+	// Free last vector without matches.
+	rz_pvector_free(matches);
+	return all_matches;
+}
+
+/**
+ * \brief Checks if \p pattern can be found in \p text.
+ * The search always starts at the beginning of \p text.
+ *
+ * \param pattern The regex pattern to match.
+ * \param text The text to search in.
+ * \param text_size The length of the buffer pointed to by \p text.
+ * Can be set to RZ_REGEX_ZERO_TERMINATED if the buffer is a zero terminated string.
+ * \param cflags Compile flags.
+ * \param mflags Match flags.
+ *
+ * \return true if the text contains the patterns.
+ * \return false Otherwise
+ */
+RZ_API bool rz_regex_contains(RZ_NONNULL const char *pattern, RZ_NONNULL const char *text,
+	RzRegexSize text_size,
+	RzRegexFlags cflags, RzRegexFlags mflags) {
+	rz_return_val_if_fail(pattern && text, false);
+
+	RzRegex *re = rz_regex_new(pattern, cflags, 0);
+	if (!re) {
+		return false;
+	}
+	RzPVector *matches = rz_regex_match_first(re, text, text_size, 0, mflags);
+	bool found = matches != NULL && !rz_pvector_empty(matches);
+	rz_pvector_free(matches);
+	// The regex was compiled only for this call.
+	rz_regex_free(re);
+	return found;
+}
+
+/**
+ * \brief Searches for a \p pattern in \p text and returns all matches as concatenated string.
+ * Only complete matches are concatenated.
Sub-groups are skipped.
+ * The search always starts at the beginning of \p text.
+ *
+ * \param pattern The regex pattern to match.
+ * \param text The text to search in.
+ * \param text_size The length of the buffer pointed to by \p text.
+ * Can be set to RZ_REGEX_ZERO_TERMINATED if the buffer is a zero terminated string.
+ * \param cflags Compile flags.
+ * \param mflags Match flags.
+ * \param separator A string to separate the matches.
+ *
+ * \return A string buffer with all matches concatenated. The buffer is empty if
+ * \p pattern could not be compiled or nothing matched.
+ * NULL if an argument is invalid or the buffer could not be allocated.
+ */
+RZ_API RZ_OWN RzStrBuf *rz_regex_full_match_str(RZ_NONNULL const char *pattern, RZ_NONNULL const char *text,
+	RzRegexSize text_size,
+	RzRegexFlags cflags, RzRegexFlags mflags, RZ_NONNULL const char *separator) {
+	rz_return_val_if_fail(pattern && text && separator, NULL);
+
+	RzPVector *matches = NULL;
+	RzRegex *re = rz_regex_new(pattern, cflags, 0);
+	RzStrBuf *sbuf = rz_strbuf_new("");
+	if (!re || !sbuf) {
+		// Do not hand a NULL regex to rz_regex_match_all(), it requires a valid one.
+		goto fini;
+	}
+	matches = rz_regex_match_all(re, text, text_size, 0, mflags);
+	if (!matches) {
+		goto fini;
+	}
+
+	const size_t match_count = rz_pvector_len(matches);
+	size_t i = 0;
+	void **m;
+	rz_pvector_foreach (matches, m) {
+		RzPVector *match_groups = *m;
+		// The head of each group vector is the complete match.
+		RzRegexMatch *match = rz_pvector_head(match_groups);
+		const char *t = text + match->start;
+		// The length is passed as int precision below, it must not overflow into negative.
+		if (((int)match->len) < 0) {
+			goto fini;
+		}
+		// The separator goes between matches only, not after the last one.
+		const bool is_last = ++i == match_count;
+		if (!rz_strbuf_appendf(sbuf, "%-.*s%s", (int)match->len, t, is_last ? "" : separator)) {
+			goto fini;
+		}
+	}
+
+fini:
+	rz_pvector_free(matches);
+	// The regex was compiled only for this call, release it again.
+	rz_regex_free(re);
+	return sbuf;
+}
diff --git a/librz/util/regex/COPYRIGHT b/librz/util/regex/COPYRIGHT
deleted file mode 100644
index a6392fd37c3..00000000000
--- a/librz/util/regex/COPYRIGHT
+++ /dev/null
@@ -1,54 +0,0 @@
-$OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $
-
-Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved.
-This software is not subject to any license of the American Telephone -and Telegraph Company or of the Regents of the University of California. - -Permission is granted to anyone to use this software for any purpose on -any computer system, and to alter it and redistribute it, subject -to the following restrictions: - -1. The author is not responsible for the consequences of use of this - software, no matter how awful, even if they arise from flaws in it. - -2. The origin of this software must not be misrepresented, either by - explicit claim or by omission. Since few users ever read sources, - credits must appear in the documentation. - -3. Altered versions must be plainly marked as such, and must not be - misrepresented as being the original software. Since few users - ever read sources, credits must appear in the documentation. - -4. This notice may not be removed or altered. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -/*- - * Copyright (c) 1994 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94 - */ diff --git a/librz/util/regex/README b/librz/util/regex/README deleted file mode 100644 index cc7acd629fd..00000000000 --- a/librz/util/regex/README +++ /dev/null @@ -1,5 +0,0 @@ -Based on the OpenBSD's regex implementation - -Modified to be portable (now compiles on windows, linux and *bsd including darwin) - -cvs -qd anoncvs@anoncvs.ca.openbsd.org:/cvs get -P src/lib/libc/regex diff --git a/librz/util/regex/cclass.h b/librz/util/regex/cclass.h deleted file mode 100644 index 00b46e37b62..00000000000 --- a/librz/util/regex/cclass.h +++ /dev/null @@ -1,70 +0,0 @@ -/* $OpenBSD: cclass.h,v 1.5 2003/06/02 20:18:36 millert Exp $ */ - -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * @(#)cclass.h 8.3 (Berkeley) 3/20/94 - */ - -/* character-class table */ -static struct cclass { - char *name; - char *chars; - char *multis; -} cclasses[] = { - { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ -0123456789", - "" }, - { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", - "" }, - { "blank", " \t", "" }, - { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ -\25\26\27\30\31\32\33\34\35\36\37\177", - "" }, - { "digit", "0123456789", "" }, - { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ -0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", - "" }, - { "lower", "abcdefghijklmnopqrstuvwxyz", - "" }, - { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ -0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", - "" }, - { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", - "" }, - { "space", "\t\n\v\f\r ", "" }, - { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "" }, - { "xdigit", "0123456789ABCDEFabcdef", - "" }, - { NULL, 0, "" } -}; diff --git a/librz/util/regex/cname.h b/librz/util/regex/cname.h deleted file mode 100644 index f17991aa285..00000000000 --- a/librz/util/regex/cname.h +++ /dev/null @@ -1,139 +0,0 @@ -/* $OpenBSD: cname.h,v 1.5 2003/06/02 20:18:36 millert Exp $ */ - -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * @(#)cname.h 8.3 (Berkeley) 3/20/94 - */ - -/* character-name table */ -static struct cname { - char *name; - char code; -} cnames[] = { - { "NUL", '\0' }, - { "SOH", '\001' }, - { "STX", '\002' }, - { "ETX", '\003' }, - { "EOT", '\004' }, - { "ENQ", '\005' }, - { "ACK", '\006' }, - { "BEL", '\007' }, - { "alert", '\007' }, - { "BS", '\010' }, - { "backspace", '\b' }, - { "HT", '\011' }, - { "tab", '\t' }, - { "LF", '\012' }, - { "newline", '\n' }, - { "VT", '\013' }, - { "vertical-tab", '\v' }, - { "FF", '\014' }, - { "form-feed", '\f' }, - { "CR", '\015' }, - { "carriage-return", '\r' }, - { "SO", '\016' }, - { "SI", '\017' }, - { "DLE", '\020' }, - { "DC1", '\021' }, - { "DC2", '\022' }, - { "DC3", '\023' }, - { "DC4", '\024' }, - { "NAK", '\025' }, - { "SYN", '\026' }, - { "ETB", '\027' }, - { "CAN", '\030' }, - { "EM", '\031' }, - { "SUB", '\032' }, - { "ESC", '\033' }, - { "IS4", '\034' }, - { "FS", '\034' }, - { "IS3", '\035' }, - { "GS", '\035' }, - { "IS2", '\036' }, - { "RS", '\036' }, - { "IS1", '\037' }, - { "US", '\037' }, - { "space", ' ' }, - { "exclamation-mark", '!' }, - { "quotation-mark", '"' }, - { "number-sign", '#' }, - { "dollar-sign", '$' }, - { "percent-sign", '%' }, - { "ampersand", '&' }, - { "apostrophe", '\'' }, - { "left-parenthesis", '(' }, - { "right-parenthesis", ')' }, - { "asterisk", '*' }, - { "plus-sign", '+' }, - { "comma", ',' }, - { "hyphen", '-' }, - { "hyphen-minus", '-' }, - { "period", '.' }, - { "full-stop", '.' }, - { "slash", '/' }, - { "solidus", '/' }, - { "zero", '0' }, - { "one", '1' }, - { "two", '2' }, - { "three", '3' }, - { "four", '4' }, - { "five", '5' }, - { "six", '6' }, - { "seven", '7' }, - { "eight", '8' }, - { "nine", '9' }, - { "colon", ':' }, - { "semicolon", ';' }, - { "less-than-sign", '<' }, - { "equals-sign", '=' }, - { "greater-than-sign", '>' }, - { "question-mark", '?' 
}, - { "commercial-at", '@' }, - { "left-square-bracket", '[' }, - { "backslash", '\\' }, - { "reverse-solidus", '\\' }, - { "right-square-bracket", ']' }, - { "circumflex", '^' }, - { "circumflex-accent", '^' }, - { "underscore", '_' }, - { "low-line", '_' }, - { "grave-accent", '`' }, - { "left-brace", '{' }, - { "left-curly-bracket", '{' }, - { "vertical-line", '|' }, - { "right-brace", '}' }, - { "right-curly-bracket", '}' }, - { "tilde", '~' }, - { "DEL", '\177' }, - { NULL, 0 } -}; diff --git a/librz/util/regex/engine.c b/librz/util/regex/engine.c deleted file mode 100644 index 1615aeda364..00000000000 --- a/librz/util/regex/engine.c +++ /dev/null @@ -1,1076 +0,0 @@ -/* $OpenBSD: engine.c,v 1.15 2005/08/05 13:03:00 espie Exp $ */ - -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)engine.c 8.5 (Berkeley) 3/20/94 - */ - -/* - * The matching engine and friends. This file is #included by regexec.c - * after suitable #defines of a variety of macros used herein, so that - * different state representations can be used without duplicating masses - * of code. - */ - -#ifdef SNAMES -#define matcher smatcher -#define fast sfast -#define slow sslow -#define dissect sdissect -#define backref sbackref -#define step sstep -#define print sprint -#define at sat -#define match smat -#define nope snope -#endif -#ifdef LNAMES -#define matcher lmatcher -#define fast lfast -#define slow lslow -#define dissect ldissect -#define backref lbackref -#define step lstep -#define print lprint -#define at lat -#define match lmat -#define nope lnope -#endif - -/* another structure passed up and down to avoid zillions of parameters */ -struct match { - struct re_guts *g; - int eflags; - RzRegexMatch *pmatch; /* [nsub+1] (0 element unused) */ - char *offp; /* offsets work from here */ - char *beginp; /* start of string -- virtual NUL precedes */ - char *endp; /* end of string -- virtual NUL here */ - char *coldp; /* can be no match starting before here */ - char **lastpos; /* [nplus+1] */ - STATEVARS; - states st; /* current states */ - states fresh; /* states for a fresh start */ - states tmp; /* temporary */ - states empty; /* empty set of states */ -}; - -static int matcher(struct re_guts *, char *, size_t, RzRegexMatch[], int); 
-static char *dissect(struct match *, char *, char *, sopno, sopno); -static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); -static char *fast(struct match *, char *, char *, sopno, sopno); -static char *slow(struct match *, char *, char *, sopno, sopno); -static states step(struct re_guts *, sopno, sopno, states, int, states); -#define MAX_RECURSION 100 -#define BOL (OUT + 1) -#define EOL (BOL + 1) -#define BOLEOL (BOL + 2) -#define NOTHING (BOL + 3) -#define BOW (BOL + 4) -#define EOW (BOL + 5) -#define CODEMAX (BOL + 5) /* highest code used */ -#define NONCHAR(c) ((c) > OUT) -#define NNONCHAR (CODEMAX - OUT) -#ifdef REDEBUG -static void print(struct match *, char *, states, int, FILE *); -#endif -#ifdef REDEBUG -static void at(struct match *, char *, char *, char *, sopno, sopno); -#endif -#ifdef REDEBUG -static char *pchar(int); -#endif - -#ifdef REDEBUG -#define SP(t, s, c) print(m, t, s, c, stdout) -#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) -#define NOTE(str) \ - { \ - if (m->eflags & RZ_REGEX_TRACE) \ - (void)printf("=%s\n", (str)); \ - } -static int nope = 0; -#else -#define SP(t, s, c) /* nothing */ -#define AT(t, p1, p2, s1, s2) /* nothing */ -#define NOTE(s) /* nothing */ -#endif - -/* - - matcher - the actual matching engine - */ -static int /* 0 success, RZ_REGEX_NOMATCH failure */ -matcher(struct re_guts *g, char *string, size_t nmatch, RzRegexMatch pmatch[], - int eflags) { - char *endp; - int i; - struct match mv; - struct match *m = &mv; - char *dp; - const sopno gf = g->firststate + 1; /* +1 for OEND */ - const sopno gl = g->laststate; - char *start; - char *stop; - - /* simplify the situation where possible */ - if (g->cflags & RZ_REGEX_NOSUB) - nmatch = 0; - if (eflags & RZ_REGEX_STARTEND) { - start = string + pmatch[0].rm_so; - stop = string + pmatch[0].rm_eo; - } else { - start = string; - stop = start + strlen(start); - } - if (stop < start) - return (RZ_REGEX_INVARG); - - /* prescreening; this does 
wonders for this rather slow code */ - if (g->must != NULL) { - for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && stop - dp >= g->mlen && - memcmp(dp, g->must, (size_t)g->mlen) == 0) - break; - if (dp == stop) /* we didn't find g->must */ - return (RZ_REGEX_NOMATCH); - } - - /* match struct setup */ - m->g = g; - m->eflags = eflags; - m->pmatch = NULL; - m->lastpos = NULL; - m->offp = string; - m->beginp = start; - m->endp = stop; - - if (m->g->nstates * 4 < m->g->nstates) - return RZ_REGEX_NOMATCH; - STATESETUP(m, 4); - SETUP(m->st); - SETUP(m->fresh); - SETUP(m->tmp); - SETUP(m->empty); - CLEAR(m->empty); - - /* this loop does only one repetition except for backrefs */ - for (;;) { - endp = fast(m, start, stop, gf, gl); - if (!endp) { /* a miss */ - free(m->pmatch); - free(m->lastpos); - STATETEARDOWN(m); - return (RZ_REGEX_NOMATCH); - } - if (nmatch == 0 && !g->backrefs) - break; /* no further info needed */ - - /* where? */ - if (!m->coldp) { - break; - } - for (;;) { - NOTE("finding start"); - endp = slow(m, m->coldp, stop, gf, gl); - if (endp || m->coldp > m->endp) { - break; - } - m->coldp++; - } - if (nmatch == 1 && !g->backrefs) - break; /* no further info needed */ - - /* oh my, he wants the subexpressions... 
*/ - if (!m->pmatch) { - if ((m->g->nsub + 1) * sizeof(RzRegexMatch) < m->g->nsub) { - return RZ_REGEX_ESPACE; - } - m->pmatch = (RzRegexMatch *)malloc((m->g->nsub + 1) * - sizeof(RzRegexMatch)); - } - if (!m->pmatch) { - STATETEARDOWN(m); - return (RZ_REGEX_ESPACE); - } - for (i = 1; i <= m->g->nsub; i++) - m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; - if (!g->backrefs && !(m->eflags & RZ_REGEX_BACKR)) { - NOTE("dissecting"); - dp = dissect(m, m->coldp, endp, gf, gl); - } else { - if (g->nplus > 0 && !m->lastpos) { - if ((g->nplus + 1) * sizeof(char *) < g->nplus) { - free(m->pmatch); - STATETEARDOWN(m); - return RZ_REGEX_ESPACE; - } - m->lastpos = (char **)malloc((g->nplus + 1) * - sizeof(char *)); - } - if (g->nplus > 0 && !m->lastpos) { - free(m->pmatch); - STATETEARDOWN(m); - return (RZ_REGEX_ESPACE); - } - NOTE("backref dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); - } - if (dp) { - break; - } - /* uh-oh... we couldn't find a subexpression-level match */ - if (!g->backrefs) { /* must be back references doing it */ - break; - } - if (g->nplus || !m->lastpos) { - break; - } - for (;;) { - if (dp != NULL || endp <= m->coldp) - break; /* defeat */ - NOTE("backoff"); - endp = slow(m, m->coldp, endp - 1, gf, gl); - if (!endp) - break; /* defeat */ - /* try it on a shorter possibility */ -#ifndef NDEBUG - for (i = 1; i <= m->g->nsub; i++) { - if (m->pmatch[i].rm_so != -1) { - break; - } - if (m->pmatch[i].rm_eo != -1) { - break; - } - } -#endif - NOTE("backoff dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); - } - if (dp != NULL || dp != endp) /* found a shorter one */ - break; - - /* despite initial appearances, there is no match here */ - NOTE("false alarm"); - if (m->coldp == stop) - break; - start = m->coldp + 1; /* recycle starting later */ - } - - /* fill in the details if requested */ - if (nmatch > 0) { - pmatch[0].rm_so = m->coldp - m->offp; - pmatch[0].rm_eo = endp - m->offp; - } - if (nmatch > 1) { - if (m->pmatch) 
{ - for (i = 1; i < nmatch; i++) { - if (i <= m->g->nsub) { - pmatch[i] = m->pmatch[i]; - } else { - pmatch[i].rm_so = -1; - pmatch[i].rm_eo = -1; - } - } - } - } - - if (m->pmatch != NULL) - free((char *)m->pmatch); - if (m->lastpos != NULL) - free((char *)m->lastpos); - STATETEARDOWN(m); - return (0); -} - -/* - - dissect - figure out what matched what, no back references - */ -static char * /* == stop (success) always */ -dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) { - int i; - sopno ss; /* start sop of current subRE */ - sopno es; /* end sop of current subRE */ - char *sp; /* start of string matched by it */ - char *stp; /* string matched by it cannot pass here */ - char *rest; /* start of rest of string */ - char *tail; /* string unmatched by rest of RE */ - sopno ssub; /* start sop of subsubRE */ - sopno esub; /* end sop of subsubRE */ - char *ssp; /* start of string matched by subsubRE */ - char *sep; /* end of string matched by subsubRE */ - char *oldssp; /* previous ssp */ - char *dp; - - AT("diss", start, stop, startst, stopst); - sp = start; - for (ss = startst; ss < stopst; ss = es) { - /* identify end of subRE */ - es = ss; - switch (OP(m->g->strip[es])) { - case OPLUS_: - case OQUEST_: - es += OPND(m->g->strip[es]); - break; - case OCH_: - while (OP(m->g->strip[es]) != O_CH) - es += OPND(m->g->strip[es]); - break; - } - es++; - - /* figure out what it matched */ - switch (OP(m->g->strip[ss])) { - case OEND: - break; - case OCHAR: - sp++; - break; - case OBOL: - case OEOL: - case OBOW: - case OEOW: - break; - case OANY: - case OANYOF: - sp++; - break; - case OBACK_: - case O_BACK: - break; - /* cases where length of match is hard to find */ - case OQUEST_: - stp = stop; - for (;;) { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - if (rest) { /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! 
*/ - /* no -- try a shorter match for this one */ - stp = rest - 1; - } - } - ssub = ss + 1; - esub = es - 1; - /* did innards match? */ - if (slow(m, sp, rest, ssub, esub) != NULL) { - dp = dissect(m, sp, rest, ssub, esub); - if (dp != rest) - return NULL; - } else if (sp != rest) - return NULL; - sp = rest; - break; - case OPLUS_: - stp = stop; - for (;;) { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - if (rest != NULL) { /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - } - } - ssub = ss + 1; - esub = es - 1; - ssp = sp; - oldssp = ssp; - for (;;) { /* find last match of innards */ - sep = slow(m, ssp, rest, ssub, esub); - if (!sep || sep == ssp) - break; /* failed or matched null */ - oldssp = ssp; /* on to next try */ - ssp = sep; - } - if (!sep) { - /* last successful match */ - sep = ssp; - ssp = oldssp; - } - if (sep == rest) { /* must exhaust substring */ - if (slow(m, ssp, sep, ssub, esub) == rest) { - dp = dissect(m, ssp, sep, ssub, esub); - if (dp == sep) { - sp = rest; - } - } - } - break; - case OCH_: - stp = stop; - for (;;) { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - if (rest) { /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! 
*/ - /* no -- try a shorter match for this one */ - stp = rest - 1; - } - } - ssub = ss + 1; - esub = ss + OPND(m->g->strip[ss]) - 1; - if (OP(m->g->strip[esub]) != OOR1) { - break; - } - for (;;) { /* find first matching branch */ - if (slow(m, sp, rest, ssub, esub) == rest) - break; /* it matched all of it */ - /* that one missed, try next one */ - if (OP(m->g->strip[esub]) == OOR1) { - esub++; - if (OP(m->g->strip[esub]) == OOR2) { - ssub = esub + 1; - esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) { - esub--; - } else { - if (OP(m->g->strip[esub]) != O_CH) { - break; - } - } - } - } - } - dp = dissect(m, sp, rest, ssub, esub); - if (dp == rest) { - sp = rest; - } - break; - case O_PLUS: - case O_QUEST: - case OOR1: - case OOR2: - case O_CH: - break; - case OLPAREN: - i = OPND(m->g->strip[ss]); - if (i > 0 && i <= m->g->nsub) { - m->pmatch[i].rm_so = sp - m->offp; - } - break; - case ORPAREN: - i = OPND(m->g->strip[ss]); - if (i > 0 && i <= m->g->nsub) { - m->pmatch[i].rm_eo = sp - m->offp; - } - break; - default: /* uh oh */ - break; - } - } - - if (sp == stop) { - return sp; - } else { - return NULL; - } -} - -/* - - backref - figure out what matched what, figuring in back references - */ -static char * /* == stop (success) or NULL (failure) */ -backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, - sopno lev, int rec) /* PLUS nesting level */ -{ - int i; - sopno ss; /* start sop of current subRE */ - char *sp; /* start of string matched by it */ - sopno ssub; /* start sop of subsubRE */ - sopno esub; /* end sop of subsubRE */ - char *ssp; /* start of string matched by subsubRE */ - char *dp; - size_t len; - int hard; - sop s; - ut64 offsave; - cset *cs; - - AT("back", start, stop, startst, stopst); - sp = start; - - /* get as far as we can with easy stuff */ - hard = 0; - for (ss = startst; !hard && ss < stopst; ss++) - switch (OP(s = m->g->strip[ss])) { - case OCHAR: - if (sp == stop || *sp++ != (char)OPND(s)) 
- return (NULL); - break; - case OANY: - if (sp == stop) - return (NULL); - sp++; - break; - case OANYOF: - cs = &m->g->sets[OPND(s)]; - if (sp == stop || !CHIN(cs, *sp++)) - return (NULL); - break; - case OBOL: - if ((sp == m->beginp && !(m->eflags & RZ_REGEX_NOTBOL)) || - (sp < m->endp && *(sp - 1) == '\n' && - (m->g->cflags & RZ_REGEX_NEWLINE))) { /* yes */ - } else - return (NULL); - break; - case OEOL: - if ((sp == m->endp && !(m->eflags & RZ_REGEX_NOTEOL)) || - (sp < m->endp && *sp == '\n' && - (m->g->cflags & RZ_REGEX_NEWLINE))) { /* yes */ - } else - return (NULL); - break; - case OBOW: - if (((sp == m->beginp && !(m->eflags & RZ_REGEX_NOTBOL)) || - (sp < m->endp && *(sp - 1) == '\n' && - (m->g->cflags & RZ_REGEX_NEWLINE)) || - (sp > m->beginp && - !ISWORD((unsigned char)*(sp - 1)))) && - (sp < m->endp && ISWORD((unsigned char)*sp))) { /* yes */ - } else - return (NULL); - break; - case OEOW: - if (((sp == m->endp && !(m->eflags & RZ_REGEX_NOTEOL)) || - (sp < m->endp && *sp == '\n' && - (m->g->cflags & RZ_REGEX_NEWLINE)) || - (sp < m->endp && !ISWORD((unsigned char)*sp))) && - (sp > m->beginp && ISWORD((unsigned char)*(sp - 1)))) { /* yes */ - } else - return (NULL); - break; - case O_QUEST: - break; - case OOR1: /* matches null but needs to skip */ - ss++; - s = m->g->strip[ss]; - do { - if (OP(s) == OOR2) { - ss += OPND(s); - } - } while (OP(s = m->g->strip[ss]) != O_CH); - /* note that the ss++ gets us past the O_CH */ - break; - default: /* have to make a choice */ - hard = 1; - break; - } - if (!hard) { /* that was it! 
*/ - if (sp != stop) - return (NULL); - return (sp); - } - ss--; /* adjust for the for's final increment */ - - /* the hard stuff */ - AT("hard", sp, stop, ss, stopst); - s = m->g->strip[ss]; - switch (OP(s)) { - case OBACK_: /* the vilest depths */ - i = OPND(s); - if (i > 0 && i <= m->g->nsub) { - if (m->pmatch[i].rm_eo == -1) { - return NULL; - } - } - if (m->pmatch[i].rm_so != -1) { - len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; - if (len == 0 && rec++ > MAX_RECURSION) - return (NULL); - if (stop - m->beginp >= len) { - if (sp > stop - len) { - return (NULL); /* not enough left to match */ - } - } - ssp = m->offp + m->pmatch[i].rm_so; - if (memcmp(sp, ssp, len) != 0) - return (NULL); - while (m->g->strip[ss] != SOP(O_BACK, i)) - ss++; - return (backref(m, sp + len, stop, ss + 1, stopst, lev, rec)); - } - break; - case OQUEST_: /* to null or not */ - dp = backref(m, sp, stop, ss + 1, stopst, lev, rec); - if (dp != NULL) - return (dp); /* not */ - return (backref(m, sp, stop, ss + OPND(s) + 1, stopst, lev, rec)); - break; - case OPLUS_: - if (m->lastpos && (lev + 1 <= m->g->nplus)) { - m->lastpos[lev + 1] = sp; - return (backref(m, sp, stop, ss + 1, stopst, lev + 1, rec)); - } - break; - case O_PLUS: - if (sp == m->lastpos[lev]) /* last pass matched null */ - return (backref(m, sp, stop, ss + 1, stopst, lev - 1, rec)); - /* try another pass */ - m->lastpos[lev] = sp; - dp = backref(m, sp, stop, ss - OPND(s) + 1, stopst, lev, rec); - if (!dp) - return (backref(m, sp, stop, ss + 1, stopst, lev - 1, rec)); - else - return (dp); - break; - case OCH_: /* find the right one, if any */ - ssub = ss + 1; - esub = ss + OPND(s) - 1; - if (OP(m->g->strip[esub]) != OOR1) { - break; - } - for (;;) { /* find first matching branch */ - dp = backref(m, sp, stop, ssub, esub, lev, rec); - if (dp != NULL) - return (dp); - /* that one missed, try next one */ - if (OP(m->g->strip[esub]) == O_CH) - return (NULL); /* there is none */ - esub++; - if (OP(m->g->strip[esub]) != OOR2) { - 
break; - } - ssub = esub + 1; - esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) - esub--; - else if (OP(m->g->strip[esub]) != O_CH) { - break; - } - } - break; - case OLPAREN: /* must undo assignment if rest fails */ - i = OPND(s); - if (i > 0 && i <= m->g->nsub) { - offsave = m->pmatch[i].rm_so; - m->pmatch[i].rm_so = sp - m->offp; - dp = backref(m, sp, stop, ss + 1, stopst, lev, rec); - if (dp != NULL) - return (dp); - m->pmatch[i].rm_so = offsave; - return (NULL); - } - break; - case ORPAREN: /* must undo assignment if rest fails */ - i = OPND(s); - if (i > 0 && i <= m->g->nsub) { - offsave = m->pmatch[i].rm_eo; - m->pmatch[i].rm_eo = sp - m->offp; - dp = backref(m, sp, stop, ss + 1, stopst, lev, rec); - if (dp != NULL) - return (dp); - m->pmatch[i].rm_eo = offsave; - return (NULL); - } - break; - default: /* uh oh */ - break; - } - - /* NOTREACHED */ - return NULL; -} - -/* - - fast - step through the string at top speed - */ -static char * /* where tentative match ended, or NULL */ -fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) { - states st = m->st; - states fresh = m->fresh; - states tmp = m->tmp; - char *p = start; - int c = (start == m->beginp) ? OUT : *(start - 1); - int lastc; /* previous c */ - int flagch; - int i; - char *coldp; /* last p after which no match was underway */ - - CLEAR(st); - SET1(st, startst); - st = step(m->g, startst, stopst, st, NOTHING, st); - ASSIGN(fresh, st); - SP("start", st, *p); - coldp = NULL; - for (;;) { - /* next character */ - lastc = c; - c = (p == m->endp) ? OUT : *p; - if (EQ(st, fresh)) { - coldp = p; - } - - /* is there an EOL and/or BOL between lastc and c? 
*/ - flagch = '\0'; - i = 0; - if ((lastc == '\n' && m->g->cflags & RZ_REGEX_NEWLINE) || - (lastc == OUT && !(m->eflags & RZ_REGEX_NOTBOL))) { - flagch = BOL; - i = m->g->nbol; - } - if ((c == '\n' && m->g->cflags & RZ_REGEX_NEWLINE) || - (c == OUT && !(m->eflags & RZ_REGEX_NOTEOL))) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("boleol", st, c); - } - - /* how about a word boundary? */ - if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c))) { - flagch = BOW; - } - if ((lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c)))) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst) || p == stop) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, fresh); - if (c == OUT) { - break; - } - st = step(m->g, startst, stopst, tmp, c, st); - SP("aft", st, c); - ASSIGN(tmp, st); - if (!EQ(step(m->g, startst, stopst, tmp, NOTHING, tmp), st)) { - break; - } - p++; - } - - if (coldp) { - m->coldp = coldp; - if (ISSET(st, stopst)) - return (p + 1); - } - return NULL; -} - -/* - - slow - step through the string more deliberately - */ -static char * /* where it ended */ -slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) { - states st = m->st; - states empty = m->empty; - states tmp = m->tmp; - char *p = start; - int c = (start == m->beginp) ? 
OUT : *(start - 1); - int lastc; /* previous c */ - int flagch; - int i; - char *matchp; /* last p at which a match ended */ - - AT("slow", start, stop, startst, stopst); - CLEAR(st); - SET1(st, startst); - SP("sstart", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - matchp = NULL; - for (;;) { - /* next character */ - lastc = c; - c = (p == m->endp) ? OUT : *p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ((lastc == '\n' && m->g->cflags & RZ_REGEX_NEWLINE) || - (lastc == OUT && !(m->eflags & RZ_REGEX_NOTBOL))) { - flagch = BOL; - i = m->g->nbol; - } - if ((c == '\n' && m->g->cflags & RZ_REGEX_NEWLINE) || - (c == OUT && !(m->eflags & RZ_REGEX_NOTEOL))) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboleol", st, c); - } - - /* how about a word boundary? */ - if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c))) { - flagch = BOW; - } - if ((lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c)))) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboweow", st, c); - } - - /* are we done? 
*/ - if (ISSET(st, stopst)) - matchp = p; - if (EQ(st, empty) || p == stop) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, empty); - if (c == OUT) { - break; - } - st = step(m->g, startst, stopst, tmp, c, st); - SP("saft", st, c); - if (!EQ(step(m->g, startst, stopst, st, NOTHING, st), st)) { - break; - } - p++; - } - - return (matchp); -} - -/* - - step - map set of states reachable before char to set reachable after - */ -static states -step(struct re_guts *g, - sopno start, /* start state within strip */ - sopno stop, /* state after stop state within strip */ - states bef, /* states reachable before */ - int ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ -{ - cset *cs; - sop s; - sopno pc; - onestate here; /* note, macros know this name */ - sopno look; - int i; - - for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { - s = g->strip[pc]; - switch (OP(s)) { - case OEND: - break; - case OCHAR: - /* only characters can match */ - if (!NONCHAR(ch) || ch != (char)OPND(s)) { - if (ch == (char)OPND(s)) - FWD(aft, bef, 1); - } - break; - case OBOL: - if (ch == BOL || ch == BOLEOL) - FWD(aft, bef, 1); - break; - case OEOL: - if (ch == EOL || ch == BOLEOL) - FWD(aft, bef, 1); - break; - case OBOW: - if (ch == BOW) - FWD(aft, bef, 1); - break; - case OEOW: - if (ch == EOW) - FWD(aft, bef, 1); - break; - case OANY: - if (!NONCHAR(ch)) - FWD(aft, bef, 1); - break; - case OANYOF: - cs = &g->sets[OPND(s)]; - if (!NONCHAR(ch) && CHIN(cs, ch)) - FWD(aft, bef, 1); - break; - case OBACK_: /* ignored here */ - case O_BACK: - FWD(aft, aft, 1); - break; - case OPLUS_: /* forward, this is just an empty */ - FWD(aft, aft, 1); - break; - case O_PLUS: /* both forward and back */ - FWD(aft, aft, 1); - i = ISSETBACK(aft, OPND(s)); - BACK(aft, aft, OPND(s)); - if (!i && ISSETBACK(aft, OPND(s))) { - /* oho, must reconsider loop body */ - pc -= OPND(s) + 1; - INIT(here, pc); 
- } - break; - case OQUEST_: /* two branches, both forward */ - FWD(aft, aft, 1); - FWD(aft, aft, OPND(s)); - break; - case O_QUEST: /* just an empty */ - FWD(aft, aft, 1); - break; - case OLPAREN: /* not significant here */ - case ORPAREN: - FWD(aft, aft, 1); - break; - case OCH_: /* mark the first two branches */ - FWD(aft, aft, 1); - if ((OP(g->strip[pc + OPND(s)]) != OOR2)) { - break; - } - FWD(aft, aft, OPND(s)); - break; - case OOR1: /* done a branch, find the O_CH */ - if (ISSTATEIN(aft, here)) { - for (look = 1; - OP(s = g->strip[pc + look]) != O_CH; - look += OPND(s)) { - if (OP(s) != OOR2) { - break; - } - } - FWD(aft, aft, look); - } - break; - case OOR2: /* propagate OCH_'s marking */ - FWD(aft, aft, 1); - if (OP(g->strip[pc + OPND(s)]) != O_CH) { - if (OP(g->strip[pc + OPND(s)]) == OOR2) { - FWD(aft, aft, OPND(s)); - } - } - break; - case O_CH: /* just empty */ - FWD(aft, aft, 1); - break; - default: /* ooooops... */ - eprintf("ops in regex.c\n"); - break; - } - } - - return (aft); -} - -#ifdef REDEBUG -/* - - print - print a set of states - */ -static void -print(struct match *m, char *caption, states st, int ch, FILE *d) { - struct re_guts *g = m->g; - int i; - int first = 1; - - if (!(m->eflags & RZ_REGEX_TRACE)) - return; - - (void)fprintf(d, "%s", caption); - if (ch != '\0') - (void)fprintf(d, " %s", pchar(ch)); - for (i = 0; i < g->nstates; i++) - if (ISSET(st, i)) { - (void)fprintf(d, "%s%d", (first) ? 
"\t" : ", ", i); - first = 0; - } - (void)fprintf(d, "\n"); -} - -/* - - at - print current situation - */ -static void -at(struct match *m, char *title, char *start, char *stop, sopno startst, - sopno stopst) { - if (!(m->eflags & RZ_REGEX_TRACE)) - return; - - (void)printf("%s %s-", title, pchar(*start)); - (void)printf("%s ", pchar(*stop)); - (void)printf("%ld-%ld\n", (long)startst, (long)stopst); -} - -#ifndef PCHARDONE -#define PCHARDONE /* never again */ -/* - - pchar - make a character printable - * - * Is this identical to regchar() over in debug.c? Well, yes. But a - * duplicate here avoids having a debugging-capable regexec.o tied to - * a matching debug.o, and this is convenient. It all disappears in - * the non-debug compilation anyway, so it doesn't matter much. - */ -static char * /* -> representation */ -pchar(int ch) { - static char pbuf[10]; - - if (isprint((ut8)ch) || ch == ' ') - (void)snprintf(pbuf, sizeof pbuf, "%c", ch); - else - (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); - return (pbuf); -} -#endif -#endif - -#undef matcher -#undef fast -#undef slow -#undef dissect -#undef backref -#undef step -#undef print -#undef at -#undef match -#undef nope diff --git a/librz/util/regex/re_format.7 b/librz/util/regex/re_format.7 deleted file mode 100644 index 72887175209..00000000000 --- a/librz/util/regex/re_format.7 +++ /dev/null @@ -1,756 +0,0 @@ -.\" $OpenBSD: re_format.7,v 1.15 2010/07/15 20:51:38 schwarze Exp $ -.\" -.\" Copyright (c) 1997, Phillip F Knaack. All rights reserved. -.\" -.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. -.\" Copyright (c) 1992, 1993, 1994 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" Henry Spencer. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. 
Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)re_format.7 8.3 (Berkeley) 3/20/94 -.\" -.Dd $Mdocdate: July 15 2010 $ -.Dt RE_FORMAT 7 -.Os -.Sh NAME -.Nm re_format -.Nd POSIX regular expressions -.Sh DESCRIPTION -Regular expressions (REs), -as defined in -.St -p1003.1-2004 , -come in two forms: -basic regular expressions -(BREs) -and extended regular expressions -(EREs). -Both forms of regular expressions are supported -by the interfaces described in -.Xr regex 3 . -Applications dealing with regular expressions -may use one or the other form -(or indeed both). -For example, -.Xr ed 1 -uses BREs, -whilst -.Xr egrep 1 -talks EREs. 
-Consult the manual page for the specific application to find out which -it uses. -.Pp -POSIX leaves some aspects of RE syntax and semantics open; -.Sq ** -marks decisions on these aspects that -may not be fully portable to other POSIX implementations. -.Pp -This manual page first describes regular expressions in general, -specifically extended regular expressions, -and then discusses differences between them and basic regular expressions. -.Sh EXTENDED REGULAR EXPRESSIONS -An ERE is one** or more non-empty** -.Em branches , -separated by -.Sq \*(Ba . -It matches anything that matches one of the branches. -.Pp -A branch is one** or more -.Em pieces , -concatenated. -It matches a match for the first, followed by a match for the second, etc. -.Pp -A piece is an -.Em atom -possibly followed by a single** -.Sq * , -.Sq + , -.Sq ?\& , -or -.Em bound . -An atom followed by -.Sq * -matches a sequence of 0 or more matches of the atom. -An atom followed by -.Sq + -matches a sequence of 1 or more matches of the atom. -An atom followed by -.Sq ?\& -matches a sequence of 0 or 1 matches of the atom. -.Pp -A bound is -.Sq { -followed by an unsigned decimal integer, -possibly followed by -.Sq ,\& -possibly followed by another unsigned decimal integer, -always followed by -.Sq } . -The integers must lie between 0 and -.Dv RE_DUP_MAX -(255**) inclusive, -and if there are two of them, the first may not exceed the second. -An atom followed by a bound containing one integer -.Ar i -and no comma matches -a sequence of exactly -.Ar i -matches of the atom. -An atom followed by a bound -containing one integer -.Ar i -and a comma matches -a sequence of -.Ar i -or more matches of the atom. -An atom followed by a bound -containing two integers -.Ar i -and -.Ar j -matches a sequence of -.Ar i -through -.Ar j -(inclusive) matches of the atom. 
-.Pp -An atom is a regular expression enclosed in -.Sq () -(matching a part of the regular expression), -an empty set of -.Sq () -(matching the null string)**, -a -.Em bracket expression -(see below), -.Sq .\& -(matching any single character), -.Sq ^ -(matching the null string at the beginning of a line), -.Sq $ -(matching the null string at the end of a line), -a -.Sq \e -followed by one of the characters -.Sq ^.[$()|*+?{\e -(matching that character taken as an ordinary character), -a -.Sq \e -followed by any other character** -(matching that character taken as an ordinary character, -as if the -.Sq \e -had not been present**), -or a single character with no other significance (matching that character). -A -.Sq { -followed by a character other than a digit is an ordinary character, -not the beginning of a bound**. -It is illegal to end an RE with -.Sq \e . -.Pp -A bracket expression is a list of characters enclosed in -.Sq [] . -It normally matches any single character from the list (but see below). -If the list begins with -.Sq ^ , -it matches any single character -.Em not -from the rest of the list -(but see below). -If two characters in the list are separated by -.Sq - , -this is shorthand for the full -.Em range -of characters between those two (inclusive) in the -collating sequence, e.g.\& -.Sq [0-9] -in ASCII matches any decimal digit. -It is illegal** for two ranges to share an endpoint, e.g.\& -.Sq a-c-e . -Ranges are very collating-sequence-dependent, -and portable programs should avoid relying on them. -.Pp -To include a literal -.Sq ]\& -in the list, make it the first character -(following a possible -.Sq ^ ) . -To include a literal -.Sq - , -make it the first or last character, -or the second endpoint of a range. -To use a literal -.Sq - -as the first endpoint of a range, -enclose it in -.Sq [. -and -.Sq .] -to make it a collating element (see below). 
-With the exception of these and some combinations using -.Sq \&[ -(see next paragraphs), -all other special characters, including -.Sq \e , -lose their special significance within a bracket expression. -.Pp -Within a bracket expression, a collating element -(a character, -a multi-character sequence that collates as if it were a single character, -or a collating-sequence name for either) -enclosed in -.Sq [. -and -.Sq .] -stands for the sequence of characters of that collating element. -The sequence is a single element of the bracket expression's list. -A bracket expression containing a multi-character collating element -can thus match more than one character, -e.g. if the collating sequence includes a -.Sq ch -collating element, -then the RE -.Sq [[.ch.]]*c -matches the first five characters of -.Sq chchcc . -.Pp -Within a bracket expression, a collating element enclosed in -.Sq [= -and -.Sq =] -is an equivalence class, standing for the sequences of characters -of all collating elements equivalent to that one, including itself. -(If there are no other equivalent collating elements, -the treatment is as if the enclosing delimiters were -.Sq [. -and -.Sq .] . ) -For example, if -.Sq x -and -.Sq y -are the members of an equivalence class, -then -.Sq [[=x=]] , -.Sq [[=y=]] , -and -.Sq [xy] -are all synonymous. -An equivalence class may not** be an endpoint of a range. -.Pp -Within a bracket expression, the name of a -.Em character class -enclosed -in -.Sq [: -and -.Sq :] -stands for the list of all characters belonging to that class. -Standard character class names are: -.Bd -literal -offset indent -alnum digit punct -alpha graph space -blank lower upper -cntrl print xdigit -.Ed -.Pp -These stand for the character classes defined in -.Xr ctype 3 . -A locale may provide others. -A character class may not be used as an endpoint of a range. 
-.Pp -There are two special cases** of bracket expressions: -the bracket expressions -.Sq [[:<:]] -and -.Sq [[:>:]] -match the null string at the beginning and end of a word, respectively. -A word is defined as a sequence of -characters starting and ending with a word character -which is neither preceded nor followed by -word characters. -A word character is an -.Em alnum -character (as defined by -.Xr ctype 3 ) -or an underscore. -This is an extension, -compatible with but not specified by POSIX, -and should be used with -caution in software intended to be portable to other systems. -.Pp -In the event that an RE could match more than one substring of a given -string, -the RE matches the one starting earliest in the string. -If the RE could match more than one substring starting at that point, -it matches the longest. -Subexpressions also match the longest possible substrings, subject to -the constraint that the whole match be as long as possible, -with subexpressions starting earlier in the RE taking priority over -ones starting later. -Note that higher-level subexpressions thus take priority over -their lower-level component subexpressions. -.Pp -Match lengths are measured in characters, not collating elements. -A null string is considered longer than no match at all. -For example, -.Sq bb* -matches the three middle characters of -.Sq abbbc ; -.Sq (wee|week)(knights|nights) -matches all ten characters of -.Sq weeknights ; -when -.Sq (.*).* -is matched against -.Sq abc , -the parenthesized subexpression matches all three characters; -and when -.Sq (a*)* -is matched against -.Sq bc , -both the whole RE and the parenthesized subexpression match the null string. -.Pp -If case-independent matching is specified, -the effect is much as if all case distinctions had vanished from the -alphabet. 
-When an alphabetic that exists in multiple cases appears as an -ordinary character outside a bracket expression, it is effectively -transformed into a bracket expression containing both cases, -e.g.\& -.Sq x -becomes -.Sq [xX] . -When it appears inside a bracket expression, -all case counterparts of it are added to the bracket expression, -so that, for example, -.Sq [x] -becomes -.Sq [xX] -and -.Sq [^x] -becomes -.Sq [^xX] . -.Pp -No particular limit is imposed on the length of REs**. -Programs intended to be portable should not employ REs longer -than 256 bytes, -as an implementation can refuse to accept such REs and remain -POSIX-compliant. -.Pp -The following is a list of extended regular expressions: -.Bl -tag -width Ds -.It Ar c -Any character -.Ar c -not listed below matches itself. -.It \e Ns Ar c -Any backslash-escaped character -.Ar c -matches itself. -.It \&. -Matches any single character that is not a newline -.Pq Sq \en . -.It Bq Ar char-class -Matches any single character in -.Ar char-class . -To include a -.Ql \&] -in -.Ar char-class , -it must be the first character. -A range of characters may be specified by separating the end characters -of the range with a -.Ql - ; -e.g.\& -.Ar a-z -specifies the lower case characters. -The following literal expressions can also be used in -.Ar char-class -to specify sets of characters: -.Bd -unfilled -offset indent -[:alnum:] [:cntrl:] [:lower:] [:space:] -[:alpha:] [:digit:] [:print:] [:upper:] -[:blank:] [:graph:] [:punct:] [:xdigit:] -.Ed -.Pp -If -.Ql - -appears as the first or last character of -.Ar char-class , -then it matches itself. -All other characters in -.Ar char-class -match themselves. -.Pp -Patterns in -.Ar char-class -of the form -.Eo [. -.Ar col-elm -.Ec .]\& -or -.Eo [= -.Ar col-elm -.Ec =]\& , -where -.Ar col-elm -is a collating element, are interpreted according to -.Xr setlocale 3 -.Pq not currently supported . 
-.It Bq ^ Ns Ar char-class -Matches any single character, other than newline, not in -.Ar char-class . -.Ar char-class -is defined as above. -.It ^ -If -.Sq ^ -is the first character of a regular expression, then it -anchors the regular expression to the beginning of a line. -Otherwise, it matches itself. -.It $ -If -.Sq $ -is the last character of a regular expression, -it anchors the regular expression to the end of a line. -Otherwise, it matches itself. -.It [[:<:]] -Anchors the single character regular expression or subexpression -immediately following it to the beginning of a word. -.It [[:>:]] -Anchors the single character regular expression or subexpression -immediately following it to the end of a word. -.It Pq Ar re -Defines a subexpression -.Ar re . -Any set of characters enclosed in parentheses -matches whatever the set of characters without parentheses matches -(that is a long-winded way of saying the constructs -.Sq (re) -and -.Sq re -match identically). -.It * -Matches the single character regular expression or subexpression -immediately preceding it zero or more times. -If -.Sq * -is the first character of a regular expression or subexpression, -then it matches itself. -The -.Sq * -operator sometimes yields unexpected results. -For example, the regular expression -.Ar b* -matches the beginning of the string -.Qq abbb -(as opposed to the substring -.Qq bbb ) , -since a null match is the only leftmost match. -.It + -Matches the singular character regular expression -or subexpression immediately preceding it -one or more times. -.It ? -Matches the singular character regular expression -or subexpression immediately preceding it -0 or 1 times. -.Sm off -.It Xo -.Pf { Ar n , m No }\ \& -.Pf { Ar n , No }\ \& -.Pf { Ar n No } -.Xc -.Sm on -Matches the single character regular expression or subexpression -immediately preceding it at least -.Ar n -and at most -.Ar m -times. -If -.Ar m -is omitted, then it matches at least -.Ar n -times. 
-If the comma is also omitted, then it matches exactly -.Ar n -times. -.It \*(Ba -Used to separate patterns. -For example, -the pattern -.Sq cat\*(Badog -matches either -.Sq cat -or -.Sq dog . -.El -.Sh BASIC REGULAR EXPRESSIONS -Basic regular expressions differ in several respects: -.Bl -bullet -offset 3n -.It -.Sq \*(Ba , -.Sq + , -and -.Sq ?\& -are ordinary characters and there is no equivalent -for their functionality. -.It -The delimiters for bounds are -.Sq \e{ -and -.Sq \e} , -with -.Sq { -and -.Sq } -by themselves ordinary characters. -.It -The parentheses for nested subexpressions are -.Sq \e( -and -.Sq \e) , -with -.Sq \&( -and -.Sq )\& -by themselves ordinary characters. -.It -.Sq ^ -is an ordinary character except at the beginning of the -RE or** the beginning of a parenthesized subexpression. -.It -.Sq $ -is an ordinary character except at the end of the -RE or** the end of a parenthesized subexpression. -.It -.Sq * -is an ordinary character if it appears at the beginning of the -RE or the beginning of a parenthesized subexpression -(after a possible leading -.Sq ^ ) . -.It -Finally, there is one new type of atom, a -.Em back-reference : -.Sq \e -followed by a non-zero decimal digit -.Ar d -matches the same sequence of characters matched by the -.Ar d Ns th -parenthesized subexpression -(numbering subexpressions by the positions of their opening parentheses, -left to right), -so that, for example, -.Sq \e([bc]\e)\e1 -matches -.Sq bb\& -or -.Sq cc -but not -.Sq bc . -.El -.Pp -The following is a list of basic regular expressions: -.Bl -tag -width Ds -.It Ar c -Any character -.Ar c -not listed below matches itself. -.It \e Ns Ar c -Any backslash-escaped character -.Ar c , -except for -.Sq { , -.Sq } , -.Sq \&( , -and -.Sq \&) , -matches itself. -.It \&. -Matches any single character that is not a newline -.Pq Sq \en . -.It Bq Ar char-class -Matches any single character in -.Ar char-class . 
-To include a -.Ql \&] -in -.Ar char-class , -it must be the first character. -A range of characters may be specified by separating the end characters -of the range with a -.Ql - ; -e.g.\& -.Ar a-z -specifies the lower case characters. -The following literal expressions can also be used in -.Ar char-class -to specify sets of characters: -.Bd -unfilled -offset indent -[:alnum:] [:cntrl:] [:lower:] [:space:] -[:alpha:] [:digit:] [:print:] [:upper:] -[:blank:] [:graph:] [:punct:] [:xdigit:] -.Ed -.Pp -If -.Ql - -appears as the first or last character of -.Ar char-class , -then it matches itself. -All other characters in -.Ar char-class -match themselves. -.Pp -Patterns in -.Ar char-class -of the form -.Eo [. -.Ar col-elm -.Ec .]\& -or -.Eo [= -.Ar col-elm -.Ec =]\& , -where -.Ar col-elm -is a collating element, are interpreted according to -.Xr setlocale 3 -.Pq not currently supported . -.It Bq ^ Ns Ar char-class -Matches any single character, other than newline, not in -.Ar char-class . -.Ar char-class -is defined as above. -.It ^ -If -.Sq ^ -is the first character of a regular expression, then it -anchors the regular expression to the beginning of a line. -Otherwise, it matches itself. -.It $ -If -.Sq $ -is the last character of a regular expression, -it anchors the regular expression to the end of a line. -Otherwise, it matches itself. -.It [[:<:]] -Anchors the single character regular expression or subexpression -immediately following it to the beginning of a word. -.It [[:>:]] -Anchors the single character regular expression or subexpression -immediately following it to the end of a word. -.It \e( Ns Ar re Ns \e) -Defines a subexpression -.Ar re . -Subexpressions may be nested. -A subsequent backreference of the form -.Pf \e Ns Ar n , -where -.Ar n -is a number in the range [1,9], expands to the text matched by the -.Ar n Ns th -subexpression. -For example, the regular expression -.Ar \e(.*\e)\e1 -matches any string consisting of identical adjacent substrings. 
-Subexpressions are ordered relative to their left delimiter. -.It * -Matches the single character regular expression or subexpression -immediately preceding it zero or more times. -If -.Sq * -is the first character of a regular expression or subexpression, -then it matches itself. -The -.Sq * -operator sometimes yields unexpected results. -For example, the regular expression -.Ar b* -matches the beginning of the string -.Qq abbb -(as opposed to the substring -.Qq bbb ) , -since a null match is the only leftmost match. -.Sm off -.It Xo -.Pf \e{ Ar n , m No \e}\ \& -.Pf \e{ Ar n , No \e}\ \& -.Pf \e{ Ar n No \e} -.Xc -.Sm on -Matches the single character regular expression or subexpression -immediately preceding it at least -.Ar n -and at most -.Ar m -times. -If -.Ar m -is omitted, then it matches at least -.Ar n -times. -If the comma is also omitted, then it matches exactly -.Ar n -times. -.El -.Sh SEE ALSO -.Xr ctype 3 , -.Xr regex 3 -.Sh STANDARDS -.St -p1003.1-2004 : -Base Definitions, Chapter 9 (Regular Expressions). -.Sh BUGS -Having two kinds of REs is a botch. -.Pp -The current POSIX spec says that -.Sq )\& -is an ordinary character in the absence of an unmatched -.Sq \&( ; -this was an unintentional result of a wording error, -and change is likely. -Avoid relying on it. -.Pp -Back-references are a dreadful botch, -posing major problems for efficient implementations. -They are also somewhat vaguely defined -(does -.Sq a\e(\e(b\e)*\e2\e)*d -match -.Sq abbbd ? ) . -Avoid using them. -.Pp -POSIX's specification of case-independent matching is vague. -The -.Dq one case implies all cases -definition given above -is the current consensus among implementors as to the right interpretation. -.Pp -The syntax for word boundaries is incredibly ugly. 
diff --git a/librz/util/regex/regcomp.c b/librz/util/regex/regcomp.c deleted file mode 100644 index 0962d3c24d6..00000000000 --- a/librz/util/regex/regcomp.c +++ /dev/null @@ -1,1786 +0,0 @@ -/* $OpenBSD: regcomp.c,v 1.20 2010/11/21 00:02:30 tedu Exp $ */ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 - */ - -#include -#include -#include -#include -#include -#include -#include "rz_regex.h" -#include "rz_util/rz_str.h" -#include "rz_util/rz_assert.h" - -#include "utils.h" -#include "regex2.h" - -#include "cclass.h" -#include "cname.h" - -/* - * parse structure, passed up and down to avoid global variables and - * other clumsinesses - */ -struct parse { - char *next; /* next character in RE */ - char *end; /* end of string (-> NUL normally) */ - int error; /* has an error been seen? 
*/ - sop *strip; /* malloced strip */ - sopno ssize; /* malloced strip size (allocated) */ - sopno slen; /* malloced strip length (used) */ - int ncsalloc; /* number of csets allocated */ - struct re_guts *g; -#define NPAREN 10 /* we need to remember () 1-9 for back refs */ - sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ - sopno pend[NPAREN]; /* -> ) ([0] unused) */ -}; - -static void p_ere(struct parse *, int); -static void p_ere_exp(struct parse *); -static void p_str(struct parse *); -static void p_bre(struct parse *, int, int); -static int p_simp_re(struct parse *, int); -static int p_count(struct parse *); -static void p_bracket(struct parse *); -static void p_b_term(struct parse *, cset *); -static void p_b_cclass(struct parse *, cset *); -static void p_b_eclass(struct parse *, cset *); -static char p_b_symbol(struct parse *); -static char p_b_coll_elem(struct parse *, int); -static char othercase(int); -static void bothcases(struct parse *, int); -static void ordinary(struct parse *, int); -static void special(struct parse *, int); -static void nonnewline(struct parse *); -static void repeat(struct parse *, sopno, int, int); -static int seterr(struct parse *, int); -static cset *allocset(struct parse *); -static void freeset(struct parse *, cset *); -static int freezeset(struct parse *, cset *); -static int firstch(struct parse *, cset *); -static int nch(struct parse *, cset *); -static void mcadd(struct parse *, cset *, char *); -static void mcinvert(struct parse *, cset *); -static void mccase(struct parse *, cset *); -static int isinsets(struct re_guts *, int); -static int samesets(struct re_guts *, int, int); -static void categorize(struct parse *, struct re_guts *); -static sopno dupl(struct parse *, sopno, sopno); -static void doemit(struct parse *, sop, size_t); -static void doinsert(struct parse *, sop, size_t, sopno); -static void dofwd(struct parse *, sopno, sop); -static void enlarge(struct parse *, sopno); -static void stripsnug(struct parse *, 
struct re_guts *); -static void findmust(struct parse *, struct re_guts *); -static sopno pluscount(struct parse *, struct re_guts *); - -static char nuls[10]; /* place to point scanner in event of error */ - -/* - * macros for use with parse structure - * BEWARE: these know that the parse structure is named `p' !!! - */ -#define PEEK() (*p->next) -#define PEEK2() (*(p->next + 1)) -#define MORE() (p->next < p->end) -#define MORE2() (p->next + 1 < p->end) -#define SEE(c) (MORE() && PEEK() == (c)) -#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) -#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) -#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) -#define NEXT() (p->next++) -#define NEXT2() (p->next += 2) -#define NEXTn(n) (p->next += (n)) -#define GETNEXT() (*p->next++) -#define SETERROR(e) seterr(p, (e)) -#define REQUIRE(co, e) (void)((co) || SETERROR(e)) -#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) -#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) -#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) -#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) -#define INSERT(op, pos) doinsert(p, (sop)(op), HERE() - (pos) + 1, pos) -#define AHEAD(pos) dofwd(p, pos, HERE() - (pos)) -#define ASTERN(sop, pos) EMIT(sop, HERE() - (pos)) -#define HERE() (p->slen) -#define THERE() (p->slen - 1) -#define THERETHERE() (p->slen - 2) -#define DROP(n) (p->slen -= (n)) - -RZ_API int rz_regex_match(const char *pattern, const char *flags, const char *text) { - int ret; - RzRegex rx; - int re_flags = rz_regex_flags(flags); - if (rz_regex_comp(&rx, pattern, re_flags)) { - eprintf("FAIL TO COMPILE %s\n", pattern); - return 0; - } - ret = rz_regex_exec(&rx, text, 0, 0, re_flags); - rz_regex_fini(&rx); - return ret ? 
0 : 1; -} - -/** - * Extract the string matched by given regex match - * - * \param str must be the exact string \p match was originally matched from - * \param match a match pointing into \p str, may be -1/-1 (not found) in which case NULL will be returned - * \return a heap-allocated string representing the contents of \p match or NULL if unmatched - */ -RZ_API char *rz_regex_match_extract(RZ_NONNULL const char *str, RZ_NONNULL RzRegexMatch *match) { - rz_return_val_if_fail(str && match, NULL); - if (match->rm_eo < 0 || match->rm_so < 0) { - return NULL; - } - size_t entry_len = match->rm_eo - match->rm_so + 1; - char *r = RZ_NEWS0(char, entry_len); - if (!r) { - return NULL; - } - rz_str_ncpy(r, str + match->rm_so, entry_len); - return r; -} - -RZ_API RzList /**/ *rz_regex_get_match_list(const char *pattern, const char *flags, const char *text) { - RzList *list = rz_list_newf(free); - RzRegex rx; - RzRegexMatch match; - int re_flags = rz_regex_flags(flags); - if (rz_regex_comp(&rx, pattern, re_flags)) { - eprintf("Failed to compile regexp: %s\n", pattern); - return NULL; - } - - /* Initialize the boundaries for RZ_REGEX_STARTEND */ - match.rm_so = 0; - match.rm_eo = strlen(text); - while (!rz_regex_exec(&rx, text, 1, &match, re_flags | RZ_REGEX_STARTEND)) { - char *entry = rz_regex_match_extract(text, &match); - if (entry) { - rz_list_append(list, entry); - } - /* Update the boundaries for RZ_REGEX_STARTEND */ - match.rm_so = match.rm_eo; - match.rm_eo = strlen(text); - } - rz_regex_fini(&rx); - return list; -} - -RZ_API RzRegex *rz_regex_new(const char *pattern, const char *flags) { - rz_return_val_if_fail(pattern, NULL); - RzRegex *r, rx = { 0 }; - if (rz_regex_comp(&rx, pattern, rz_regex_flags(flags))) { - return NULL; - } - r = RZ_NEW(RzRegex); - if (!r) { - return NULL; - } - memcpy(r, &rx, sizeof(RzRegex)); - return r; -} - -RZ_API int rz_regex_flags(const char *f) { - int flags = 0; - if (!f || !*f) { - return 0; - } - if (strchr(f, 'e')) { - flags |= 
RZ_REGEX_EXTENDED; - } - if (strchr(f, 'i')) { - flags |= RZ_REGEX_ICASE; - } - if (strchr(f, 's')) { - flags |= RZ_REGEX_NOSUB; - } - if (strchr(f, 'n')) { - flags |= RZ_REGEX_NEWLINE; - } - if (strchr(f, 'N')) { - flags |= RZ_REGEX_NOSPEC; - } - if (strchr(f, 'p')) { - flags |= RZ_REGEX_PEND; - } - if (strchr(f, 'd')) { - flags |= RZ_REGEX_DUMP; - } - return flags; -} - -RZ_API void rz_regex_fini(RzRegex *preg) { - struct re_guts *g; - if (!preg) { - return; - } - if (preg->re_magic != MAGIC1) { /* oops */ - return; /* nice to complain, but hard */ - } - - g = preg->re_g; - if (!g || g->magic != MAGIC2) { /* oops again */ - return; - } - preg->re_magic = 0; /* mark it invalid */ - g->magic = 0; /* mark it invalid */ - - free(g->strip); - free(g->sets); - free(g->setbits); - free(g->must); - free(g); -} - -RZ_API void rz_regex_free(RzRegex *preg) { - rz_regex_fini(preg); - free(preg); -} - -/* - - regcomp - interface for parser and compilation - - 0 success, otherwise RZ_REGEX_something - */ -RZ_API int rz_regex_comp(RzRegex *preg, const char *pattern, int cflags) { - struct parse pa; - struct re_guts *g; - struct parse *p = &pa; - int i; - size_t len; -#ifdef REDEBUG -#define GOODFLAGS(f) (f) -#else -#define GOODFLAGS(f) ((f) & ~RZ_REGEX_DUMP) -#endif - cflags = GOODFLAGS(cflags); - if (!preg || ((cflags & RZ_REGEX_EXTENDED) && (cflags & RZ_REGEX_NOSPEC))) { - return RZ_REGEX_INVARG; - } - if (cflags & RZ_REGEX_PEND) { - if (preg->re_endp < pattern) { - return RZ_REGEX_INVARG; - } - len = preg->re_endp - pattern; - } else { - len = strlen((char *)pattern); - } - /* do the mallocs early so failure handling is easy */ - g = calloc(1, sizeof(struct re_guts) + (NC - 1)); - if (!g) { - return RZ_REGEX_ESPACE; - } - /* - * Limit the pattern space to avoid a 32-bit overflow on buffer - * extension. Also avoid any signed overflow in case of conversion - * so make the real limit based on a 31-bit overflow. 
- * - * Likely not applicable on 64-bit systems but handle the case - * generically (who are we to stop people from using ~715MB+ - * patterns?). - */ - size_t maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; - if (len >= maxlen) { - free(g); - return RZ_REGEX_ESPACE; - } - preg->re_flags = cflags; - p->ssize = len / (size_t)2 * (size_t)3 + (size_t)1; /* ugh */ - if (p->ssize < len) { - free(g); - return RZ_REGEX_ESPACE; - } - - p->strip = (sop *)calloc(p->ssize, sizeof(sop)); - if (!p->strip) { - free(g); - return RZ_REGEX_ESPACE; - } - p->slen = 0; - if (!p->strip) { - free(g); - return RZ_REGEX_ESPACE; - } - - /* set things up */ - p->g = g; - p->next = (char *)pattern; /* convenience; we do not modify it */ - p->end = p->next + len; - p->error = 0; - p->ncsalloc = 0; - for (i = 0; i < NPAREN; i++) { - p->pbegin[i] = 0; - p->pend[i] = 0; - } - g->csetsize = NC; - g->sets = NULL; - g->setbits = NULL; - g->ncsets = 0; - g->cflags = cflags; - g->iflags = 0; - g->nbol = 0; - g->neol = 0; - g->must = NULL; - g->mlen = 0; - g->nsub = 0; - g->ncategories = 1; /* category 0 is "everything else" */ - g->categories = &g->catspace[-(CHAR_MIN)]; - (void)memset((char *)g->catspace, 0, NC * sizeof(cat_t)); - g->backrefs = 0; - - /* do it */ - EMIT(OEND, 0); - g->firststate = THERE(); - if (cflags & RZ_REGEX_EXTENDED) { - p_ere(p, OUT); - } else if (cflags & RZ_REGEX_NOSPEC) { - p_str(p); - } else { - p_bre(p, OUT, OUT); - } - EMIT(OEND, 0); - g->laststate = THERE(); - - /* tidy up loose ends and fill things in */ - categorize(p, g); - stripsnug(p, g); - findmust(p, g); - g->nplus = pluscount(p, g); - g->magic = MAGIC2; - preg->re_nsub = g->nsub; - preg->re_g = g; - preg->re_magic = MAGIC1; -#ifndef REDEBUG - /* not debugging, so can't rely on the asssert() in regexec() */ - if (g->iflags & BAD) { - SETERROR(RZ_REGEX_ASSERT); - } -#endif - if (p->error) { - rz_regex_fini(preg); - } - return p->error; -} - -/* - - p_ere - ERE parser top level, concatenation and alternation - 
*/ -static void p_ere(struct parse *p, int stop) { /* character this ERE should end at */ - bool isFirst = true; - sopno prevback = 0; - sopno prevfwd = 0; - sopno conc = 0; - char c; - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) { - p_ere_exp(p); - } - REQUIRE(HERE() != conc, RZ_REGEX_EMPTY); /* require nonempty */ - - if (!EAT('|')) { - break; /* NOTE BREAK OUT */ - } - if (isFirst) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - isFirst = false; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!isFirst) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - // asert(!MORE() || SEE(stop)); -} - -/* - - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - */ -static void p_ere_exp(struct parse *p) { - char c; - sopno pos; - int count; - int count2; - sopno subno; - int wascaret = 0; - - if (!MORE()) { /* caller should have ensured this */ - return; - } - c = GETNEXT(); - - pos = HERE(); - switch (c) { - case '(': - REQUIRE(MORE(), RZ_REGEX_EPAREN); - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) { - p->pbegin[subno] = HERE(); - } - EMIT(OLPAREN, subno); - if (!SEE(')')) { - p_ere(p, ')'); - } - if (subno < NPAREN) { - p->pend[subno] = HERE(); - if (!p->pend[subno]) { - break; - } - } - EMIT(ORPAREN, subno); - MUSTEAT(')', RZ_REGEX_EPAREN); - break; - case '^': - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - wascaret = 1; - break; - case '$': - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - break; - case '|': - SETERROR(RZ_REGEX_EMPTY); - break; - case '*': - case '+': - case '?': - SETERROR(RZ_REGEX_BADRPT); - break; - case '.': - if (p->g->cflags & RZ_REGEX_NEWLINE) { - nonnewline(p); - } else { - EMIT(OANY, 0); - } - break; - case '[': - 
p_bracket(p); - break; - case '\\': - REQUIRE(MORE(), RZ_REGEX_EESCAPE); - c = GETNEXT(); - if (!isalpha(c)) { - ordinary(p, c); - } else { - special(p, c); - } - break; - case '{': /* okay as ordinary except if digit follows */ - REQUIRE(!MORE() || !isdigit((ut8)PEEK()), RZ_REGEX_BADRPT); - /* FALLTHROUGH */ - default: - ordinary(p, c); - break; - } - - if (!MORE()) { - return; - } - c = PEEK(); - /* we call { a repetition if followed by a digit */ - if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit((ut8)PEEK2())))) { - return; /* no repetition, we're done */ - } - NEXT(); - - REQUIRE(!wascaret, RZ_REGEX_BADRPT); - switch (c) { - case '*': /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - break; - case '+': - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - break; - case '?': - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, pos); /* offset slightly wrong */ - ASTERN(OOR1, pos); /* this one's right */ - AHEAD(pos); /* fix the OCH_ */ - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - break; - case '{': - count = p_count(p); - if (EAT(',')) { - if (isdigit((ut8)PEEK())) { - count2 = p_count(p); - REQUIRE(count <= count2, RZ_REGEX_BADBR); - } else { /* single number with comma */ - count2 = INTFINITY; - } - } else { /* just a single number */ - count2 = count; - } - repeat(p, pos, count, count2); - if (!EAT('}')) { /* error heuristics */ - while (MORE() && PEEK() != '}') { - NEXT(); - } - REQUIRE(MORE(), RZ_REGEX_EBRACE); - SETERROR(RZ_REGEX_BADBR); - } - break; - } - - if (!MORE()) { - return; - } - c = PEEK(); - if (!(c == '*' || c == '+' || c == '?' 
|| - (c == '{' && MORE2() && isdigit((ut8)PEEK2())))) { - return; - } - SETERROR(RZ_REGEX_BADRPT); -} - -/* - - p_str - string (no metacharacters) "parser" - */ -static void p_str(struct parse *p) { - REQUIRE(MORE(), RZ_REGEX_EMPTY); - while (MORE()) { - ordinary(p, GETNEXT()); - } -} - -/* - - p_bre - BRE parser top level, anchoring and concatenation - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. The - * only undesirable side effect is that '$' gets included as a character - * category in such cases. This is fairly harmless; not worth fixing. - * The amount of lookahead needed to avoid this kludge is excessive. - */ -static void p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ -{ - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; - - if (EAT('^')) { - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ - DROP(1); - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - } - - REQUIRE(HERE() != start, RZ_REGEX_EMPTY); /* require nonempty */ -} - -/* - - p_simp_re - parse a simple RE, an atom possibly followed by a repetition - */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? 
*/ -{ - int c; - int count; - int count2; - sopno pos; - int i; - sopno subno; -#define BACKSL (1 << CHAR_BIT) - - pos = HERE(); /* repetion op, if any, covers from here */ - - if (!MORE()) { /* caller should have ensured this */ - return 0; - } - c = GETNEXT(); - if (c == '\\') { - REQUIRE(MORE(), RZ_REGEX_EESCAPE); - c = BACKSL | GETNEXT(); - } - switch (c) { - case '.': - if (p->g->cflags & RZ_REGEX_NEWLINE) { - nonnewline(p); - } else { - EMIT(OANY, 0); - } - break; - case '[': - p_bracket(p); - break; - case BACKSL | '{': - SETERROR(RZ_REGEX_BADRPT); - break; - case BACKSL | '(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) { - p->pbegin[subno] = HERE(); - } - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) { - p_bre(p, '\\', ')'); - } - if (subno < NPAREN) { - p->pend[subno] = HERE(); - if (!p->pend[subno]) { - break; - } - } - EMIT(ORPAREN, subno); - REQUIRE(EATTWO('\\', ')'), RZ_REGEX_EPAREN); - break; - case BACKSL | ')': /* should not get here -- must be user */ - case BACKSL | '}': - SETERROR(RZ_REGEX_EPAREN); - break; - case BACKSL | '1': - case BACKSL | '2': - case BACKSL | '3': - case BACKSL | '4': - case BACKSL | '5': - case BACKSL | '6': - case BACKSL | '7': - case BACKSL | '8': - case BACKSL | '9': - i = (c & ~BACKSL) - '0'; - if (p->pend[i] != 0) { - if (i <= p->g->nsub) { - EMIT(OBACK_, i); - if (p->pbegin[i] != 0 && OP(p->strip[p->pbegin[i]]) == OLPAREN && - OP(p->strip[p->pend[i]]) == ORPAREN) { - (void)dupl(p, p->pbegin[i] + 1, p->pend[i]); - EMIT(O_BACK, i); - } - } - } else { - SETERROR(RZ_REGEX_ESUBREG); - } - p->g->backrefs = 1; - break; - case '*': - REQUIRE(starordinary, RZ_REGEX_BADRPT); - /* FALLTHROUGH */ - default: - ordinary(p, (char)c); - break; - } - - if (EAT('*')) { /* implemented as +? 
*/ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - } else if (EATTWO('\\', '{')) { - count = p_count(p); - if (EAT(',')) { - if (MORE() && isdigit((ut8)PEEK())) { - count2 = p_count(p); - REQUIRE(count <= count2, RZ_REGEX_BADBR); - } else { /* single number with comma */ - count2 = INTFINITY; - } - } else { /* just a single number */ - count2 = count; - } - repeat(p, pos, count, count2); - if (!EATTWO('\\', '}')) { /* error heuristics */ - while (MORE() && !SEETWO('\\', '}')) { - NEXT(); - } - REQUIRE(MORE(), RZ_REGEX_EBRACE); - SETERROR(RZ_REGEX_BADBR); - } - } else if (c == '$') { /* $ (but not \$) ends it */ - return (1); - } - - return (0); -} - -/* - - p_count - parse a repetition count - */ -static int /* the value */ -p_count(struct parse *p) { - int count = 0; - int ndigits = 0; - - while (MORE() && isdigit((ut8)PEEK()) && count <= DUPMAX) { - count = count * 10 + (GETNEXT() - '0'); - ndigits++; - } - - REQUIRE(ndigits > 0 && count <= DUPMAX, RZ_REGEX_BADBR); - return (count); -} - -/* - - p_bracket - parse a bracketed character list - * - * Note a significant property of this code: if the allocset() did SETERROR, - * no set operations are done. 
- */ -static void p_bracket(struct parse *p) { - cset *cs; - int invert = 0; - - /* Dept of Truly Sickening Special-Case Kludges */ - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { - EMIT(OBOW, 0); - NEXTn(6); - return; - } - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { - EMIT(OEOW, 0); - NEXTn(6); - return; - } - - if (!(cs = allocset(p))) { - /* allocset did set error status in p */ - return; - } - - if (EAT('^')) { - invert++; /* make note to invert set at end */ - } - if (EAT(']')) { - CHadd(cs, ']'); - } else if (EAT('-')) { - CHadd(cs, '-'); - } - while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) { - p_b_term(p, cs); - } - if (EAT('-')) { - CHadd(cs, '-'); - } - MUSTEAT(']', RZ_REGEX_EBRACK); - - if (p->error != 0) { /* don't mess things up further */ - freeset(p, cs); - return; - } - - if (p->g->cflags & RZ_REGEX_ICASE) { - int i; - int ci; - - for (i = p->g->csetsize - 1; i >= 0; i--) { - if (CHIN(cs, i) && isalpha(i)) { - ci = othercase(i); - if (ci != i) { - CHadd(cs, ci); - } - } - } - if (cs->multis != NULL) { - mccase(p, cs); - } - } - if (invert) { - int i; - - for (i = p->g->csetsize - 1; i >= 0; i--) { - if (CHIN(cs, i)) { - CHsub(cs, i); - } else { - CHadd(cs, i); - } - } - if (p->g->cflags & RZ_REGEX_NEWLINE) { - CHsub(cs, '\n'); - } - if (cs->multis != NULL) { - mcinvert(p, cs); - } - } - - if (cs->multis) { /* xxx */ - return; - } - - if (nch(p, cs) == 1) { /* optimize singleton sets */ - ordinary(p, firstch(p, cs)); - freeset(p, cs); - } else { - EMIT(OANYOF, freezeset(p, cs)); - } -} - -/* - - p_b_term - parse one term of a bracketed character list - */ -static void p_b_term(struct parse *p, cset *cs) { - char c; - char start = 0, finish; - int i; - - /* classify what we've got */ - switch ((MORE()) ? PEEK() : '\0') { - case '[': - c = (MORE2()) ? 
PEEK2() : '\0'; - break; - case '-': - SETERROR(RZ_REGEX_ERANGE); - return; /* NOTE RETURN */ - break; - default: - c = '\0'; - break; - } - - switch (c) { - case ':': /* character class */ - NEXT2(); - REQUIRE(MORE(), RZ_REGEX_EBRACK); - c = PEEK(); - REQUIRE(c != '-' && c != ']', RZ_REGEX_ECTYPE); - p_b_cclass(p, cs); - REQUIRE(MORE(), RZ_REGEX_EBRACK); - REQUIRE(EATTWO(':', ']'), RZ_REGEX_ECTYPE); - break; - case '=': /* equivalence class */ - NEXT2(); - REQUIRE(MORE(), RZ_REGEX_EBRACK); - c = PEEK(); - REQUIRE(c != '-' && c != ']', RZ_REGEX_ECOLLATE); - p_b_eclass(p, cs); - REQUIRE(MORE(), RZ_REGEX_EBRACK); - REQUIRE(EATTWO('=', ']'), RZ_REGEX_ECOLLATE); - break; - default: /* symbol, ordinary character, or range */ - /* xxx revision needed for multichar stuff */ - start = p_b_symbol(p); - if (SEE('-') && MORE2() && PEEK2() != ']') { - /* range */ - NEXT(); - if (EAT('-')) { - finish = '-'; - } else { - finish = p_b_symbol(p); - } - } else { - finish = start; - } - /* xxx what about signed chars here... */ - REQUIRE(start <= finish, RZ_REGEX_ERANGE); - for (i = start; i <= finish; i++) { - CHadd(cs, i); - } - break; - } -} - -/* - - p_b_cclass - parse a character-class name and deal with it - */ -static void p_b_cclass(struct parse *p, cset *cs) { - char *sp = p->next; - struct cclass *cp; - size_t len; - char *u; - char c; - - while (MORE() && isalpha((unsigned char)PEEK())) { - NEXT(); - } - len = p->next - sp; - for (cp = cclasses; cp->name != NULL; cp++) { - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') { - break; - } - } - if (!cp->name) { - /* oops, didn't find it */ - SETERROR(RZ_REGEX_ECTYPE); - return; - } - - u = cp->chars; - while ((c = *u++) != '\0') { - CHadd(cs, c); - } - for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) { - MCadd(p, cs, u); - } -} - -/* - - p_b_eclass - parse an equivalence-class name and deal with it - * - * This implementation is incomplete. 
xxx - */ -static void p_b_eclass(struct parse *p, cset *cs) { - char c; - - c = p_b_coll_elem(p, '='); - CHadd(cs, c); -} - -/* - - p_b_symbol - parse a character or [..]ed multicharacter collating symbol - */ -static char /* value of symbol */ -p_b_symbol(struct parse *p) { - char value; - - REQUIRE(MORE(), RZ_REGEX_EBRACK); - if (!EATTWO('[', '.')) { - return (GETNEXT()); - } - - /* collating symbol */ - value = p_b_coll_elem(p, '.'); - REQUIRE(EATTWO('.', ']'), RZ_REGEX_ECOLLATE); - return (value); -} - -/* - - p_b_coll_elem - parse a collating-element name and look it up - */ -static char /* value of collating element */ -p_b_coll_elem(struct parse *p, - int endc) /* name ended by endc,']' */ -{ - char *sp = p->next; - struct cname *cp; - int len; - - while (MORE() && !SEETWO(endc, ']')) { - NEXT(); - } - if (!MORE()) { - SETERROR(RZ_REGEX_EBRACK); - return (0); - } - len = p->next - sp; - for (cp = cnames; cp->name != NULL; cp++) { - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') { - return (cp->code); /* known name */ - } - } - if (len == 1) { - return (*sp); /* single character */ - } - SETERROR(RZ_REGEX_ECOLLATE); /* neither */ - return (0); -} - -/* - - othercase - return the case counterpart of an alphabetic - */ -static char /* if no counterpart, return ch */ -othercase(int ch) { - ch = (ut8)ch; - if (isalpha(ch)) { - if (isupper(ch)) { - return ((ut8)tolower(ch)); - } else if (islower(ch)) { - return ((ut8)toupper(ch)); - } else { /* peculiar, but could happen */ - return (ch); - } - } - return ch; -} - -/* - - bothcases - emit a dualcase version of a two-case character - * - * Boy, is this implementation ever a kludge... 
- */ -static void bothcases(struct parse *p, int ch) { - char *oldnext = p->next; - char *oldend = p->end; - char bracket[3]; - - ch = (ut8)ch; - if (othercase(ch) != ch) { /* p_bracket() would recurse */ - p->next = bracket; - p->end = bracket + 2; - bracket[0] = ch; - bracket[1] = ']'; - bracket[2] = '\0'; - p_bracket(p); - if (p->next == bracket + 2) { - p->next = oldnext; - p->end = oldend; - } - } -} - -/* - - ordinary - emit an ordinary character - */ -static void -ordinary(struct parse *p, int ch) { - cat_t *cap = p->g->categories; - - if ((p->g->cflags & RZ_REGEX_ICASE) && isalpha((ut8)ch) && othercase(ch) != ch) { - bothcases(p, ch); - } else { - EMIT(OCHAR, (ut8)ch); - if (cap[ch] == 0) { - cap[ch] = p->g->ncategories++; - } - } -} - -static void -special(struct parse *p, int ch) { - char *oldnext = p->next; - char *oldend = p->end; - char bracket[16] = { 0 }; - char digits[3] = { 0 }; - char c; - int num = 0; - switch (ch) { - case 'x': - digits[0] = GETNEXT(); - digits[1] = GETNEXT(); - c = (char)strtol(digits, NULL, 16); - ordinary(p, c); - return; - case 'n': - ordinary(p, '\n'); - return; - case 't': - ordinary(p, '\t'); - return; - case 'r': - ordinary(p, '\r'); - return; - case 's': - case 'S': - num = 6; - const char *chars = "^\t\r\n ]"; - if (ch == 's') { - num--; - chars++; - } - memcpy(bracket, chars, num); - break; - case 'd': - num = 4; - memcpy(bracket, "0-9]", num); - break; - case 'w': - num = 4; - memcpy(bracket, "a-z]", num); - break; - default: - SETERROR(RZ_REGEX_INVARG); - return; - } - - p->next = bracket; - p->end = bracket + num; - - p_bracket(p); - - if (p->next == bracket + num) { - p->next = oldnext; - p->end = oldend; - } -} - -/* - - nonnewline - emit RZ_REGEX_NEWLINE version of OANY - * - * Boy, is this implementation ever a kludge... 
- */ -static void -nonnewline(struct parse *p) { - char *oldnext = p->next; - char *oldend = p->end; - char bracket[4]; - - p->next = bracket; - p->end = bracket + 3; - bracket[0] = '^'; - bracket[1] = '\n'; - bracket[2] = ']'; - bracket[3] = '\0'; - p_bracket(p); - if (p->next == bracket + 3) { - p->next = oldnext; - p->end = oldend; - } -} - -/* - - repeat - generate code for a bounded repetition, recursively if needed - */ -static void -repeat(struct parse *p, - sopno start, /* operand from here to end of strip */ - int from, /* repeated from this number */ - int to) /* to this number of times (maybe INTFINITY) */ -{ - sopno finish = HERE(); -#define N 2 -#define INF 3 -#define REP(f, t) ((f)*8 + (t)) -#define MAP(n) (((n) <= 1) ? (n) : ((n) == INTFINITY) ? INF \ - : N) - sopno copy; - - if (p->error != 0) { /* head off possible runaway recursion */ - return; - } - - if (from > to) { - return; - } - - switch (REP(MAP(from), MAP(to))) { - case REP(0, 0): /* must be user doing this */ - DROP(finish - start); /* drop the operand */ - break; - case REP(0, 1): /* as x{1,1}? */ - case REP(0, N): /* as x{1,n}? */ - case REP(0, INF): /* as x{1,}? */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); /* offset is wrong... */ - repeat(p, start + 1, 1, to); - ASTERN(OOR1, start); - AHEAD(start); /* ... fix it */ - EMIT(OOR2, 0); - AHEAD(THERE()); - ASTERN(O_CH, THERETHERE()); - break; - case REP(1, 1): /* trivial case */ - /* done */ - break; - case REP(1, N): /* as x?x{1,n-1} */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); - ASTERN(OOR1, start); - AHEAD(start); - EMIT(OOR2, 0); /* offset very wrong... 
*/ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - copy = dupl(p, start + 1, finish + 1); - if (copy == finish + 4) { - repeat(p, copy, 1, to - 1); - } - break; - case REP(1, INF): /* as x+ */ - INSERT(OPLUS_, start); - ASTERN(O_PLUS, start); - break; - case REP(N, N): /* as xx{m-1,n-1} */ - copy = dupl(p, start, finish); - repeat(p, copy, from - 1, to - 1); - break; - case REP(N, INF): /* as xx{n-1,INF} */ - copy = dupl(p, start, finish); - repeat(p, copy, from - 1, to); - break; - default: /* "can't happen" */ - SETERROR(RZ_REGEX_ASSERT); /* just in case */ - break; - } -} - -/* - - seterr - set an error condition - */ -static int /* useless but makes type checking happy */ -seterr(struct parse *p, int e) { - if (p->error == 0) { /* keep earliest error condition */ - p->error = e; - } - p->next = nuls; /* try to bring things to a halt */ - p->end = nuls; - return (0); /* make the return value well-defined */ -} - -/* - - allocset - allocate a set of characters for [] - */ -static cset *allocset(struct parse *p) { - int no = p->g->ncsets++; - size_t nc; - size_t nbytes; - cset *cs; - size_t css = (size_t)p->g->csetsize; - int i; - - if (no >= p->ncsalloc) { /* need another column of space */ - void *ptr; - - p->ncsalloc += CHAR_BIT; - nc = p->ncsalloc; - if (nc % CHAR_BIT) { - goto nomem; - } - nbytes = nc / CHAR_BIT * css; - - ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); - if (!ptr) { - goto nomem; - } - p->g->sets = ptr; - - ptr = (ut8 *)realloc((char *)p->g->setbits, nbytes); - if (!ptr) { - goto nomem; - } - p->g->setbits = ptr; - - for (i = 0; i < no; i++) { - p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT); - } - - (void)memset((char *)p->g->setbits + (nbytes - css), 0, css); - } - /* XXX should not happen */ - if (!p->g->sets || !p->g->setbits) { - goto nomem; - } - - cs = &p->g->sets[no]; - cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT); - cs->mask = 1 << ((no) % CHAR_BIT); - cs->hash = 0; - cs->smultis = 
0; - cs->multis = NULL; - - return (cs); -nomem: - RZ_FREE(p->g->sets); - RZ_FREE(p->g->setbits); - - SETERROR(RZ_REGEX_ESPACE); - /* caller's responsibility not to do set ops */ - return (NULL); -} - -/* - - freeset - free a now-unused set - */ -static void freeset(struct parse *p, cset *cs) { - int i; - cset *top = &p->g->sets[p->g->ncsets]; - size_t css = (size_t)p->g->csetsize; - - for (i = 0; i < css; i++) { - CHsub(cs, i); - } - if (cs == top - 1) { /* recover only the easy case */ - p->g->ncsets--; - } -} - -/* - - freezeset - final processing on a set of characters - * - * The main task here is merging identical sets. This is usually a waste - * of time (although the hash code minimizes the overhead), but can win - * big if RZ_REGEX_ICASE is being used. RZ_REGEX_ICASE, by the way, is why the hash - * is done using addition rather than xor -- all ASCII [aA] sets xor to - * the same value! - */ -static int /* set number */ -freezeset(struct parse *p, cset *cs) { - ut8 h = cs->hash; - int i; - cset *top = &p->g->sets[p->g->ncsets]; - cset *cs2; - size_t css = (size_t)p->g->csetsize; - - /* look for an earlier one which is the same */ - for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) { - if (cs2->hash == h && cs2 != cs) { - /* maybe */ - for (i = 0; i < css; i++) { - if (!!CHIN(cs2, i) != !!CHIN(cs, i)) { - break; /* no */ - } - } - if (i == css) { - break; /* yes */ - } - } - } - - if (cs2 < top) { /* found one */ - freeset(p, cs); - cs = cs2; - } - - return ((int)(cs - p->g->sets)); -} - -/* - - firstch - return first character in a set (which must have at least one) - */ -static int /* character; there is no "none" value */ -firstch(struct parse *p, cset *cs) { - int i; - size_t css = (size_t)p->g->csetsize; - - for (i = 0; i < css; i++) { - if (CHIN(cs, i)) { - return ((char)i); - } - } - return (0); /* arbitrary */ -} - -/* - - nch - number of characters in a set - */ -static int nch(struct parse *p, cset *cs) { - int i; - size_t css = (size_t)p->g->csetsize; 
- int n = 0; - - for (i = 0; i < css; i++) { - if (CHIN(cs, i)) { - n++; - } - } - return (n); -} - -/* - - mcadd - add a collating element to a cset - */ -static void mcadd(struct parse *p, cset *cs, char *cp) { - size_t oldend = cs->smultis; - void *np; - - cs->smultis += strlen(cp) + 1; - np = realloc(cs->multis, cs->smultis); - if (!np) { - if (cs->multis) { - free(cs->multis); - } - cs->multis = NULL; - SETERROR(RZ_REGEX_ESPACE); - return; - } - cs->multis = np; - - STRLCPY(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); -} - -/* - - mcinvert - invert the list of collating elements in a cset - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -/* ARGSUSED */ -static void mcinvert(struct parse *p, cset *cs) { - // asert(!cs->multis); /* xxx */ - return; -} - -/* - - mccase - add case counterparts of the list of collating elements in a cset - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -/* ARGSUSED */ -static void mccase(struct parse *p, cset *cs) { - // asert(!cs->multis); /* xxx */ - return; -} - -/* - - isinsets - is this character in any sets? - */ -static int /* predicate */ -isinsets(struct re_guts *g, int c) { - ut8 *col; - int i; - int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT; - unsigned uc = (ut8)c; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) { - if (col[uc] != 0) { - return (1); - } - } - return (0); -} - -/* - - samesets - are these two characters in exactly the same sets? 
- */ -static int /* predicate */ -samesets(struct re_guts *g, int c1, int c2) { - ut8 *col; - int i; - int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT; - unsigned uc1 = (ut8)c1; - unsigned uc2 = (ut8)c2; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) { - if (col[uc1] != col[uc2]) { - return (0); - } - } - return (1); -} - -/* - - categorize - sort out character categories - */ -static void -categorize(struct parse *p, struct re_guts *g) { - cat_t *cats = g ? g->categories : NULL; - int c; - int c2; - cat_t cat; - - /* avoid making error situations worse */ - if (!p || p->error != 0 || !cats) { - return; - } - - for (c = CHAR_MIN; c <= CHAR_MAX; c++) { - if (*(cats + c) && isinsets(g, c)) { - cat = g->ncategories++; - cats[c] = cat; - for (c2 = c + 1; c2 <= CHAR_MAX; c2++) { - if (cats[c2] == 0 && samesets(g, c, c2)) { - cats[c2] = cat; - } - } - } - } -} - -/* - - dupl - emit a duplicate of a bunch of sops - */ -static sopno /* start of duplicate */ -dupl(struct parse *p, - sopno start, /* from here */ - sopno finish) /* to this less one */ -{ - sopno ret = HERE(); - sopno len = finish - start; - - if (finish >= start) { - if (len == 0) { - return (ret); - } - enlarge(p, p->ssize + len); /* this many unexpected additions */ - if (p->ssize >= p->slen + len) { - (void)memcpy((char *)(p->strip + p->slen), - (char *)(p->strip + start), (size_t)len * sizeof(sop)); - p->slen += len; - return (ret); - } - } - return ret; -} - -/* - - doemit - emit a strip operator - * - * It might seem better to implement this as a macro with a function as - * hard-case backup, but it's just too big and messy unless there are - * some changes to the data structures. Maybe later. 
- */ -static void -doemit(struct parse *p, sop op, size_t opnd) { - /* avoid making error situations worse */ - if (p->error != 0) { - return; - } - - /* deal with oversize operands ("can't happen", more or less) */ - if (opnd < 1 << OPSHIFT) { - - /* deal with undersized strip */ - if (p->slen >= p->ssize) { - enlarge(p, (p->ssize + 1) / 2 * 3); /* +50% */ - } - if (p->slen < p->ssize) { - /* finally, it's all reduced to the easy case */ - p->strip[p->slen++] = SOP(op, opnd); - } - } -} - -/* - - doinsert - insert a sop into the strip - */ -static void -doinsert(struct parse *p, sop op, size_t opnd, sopno pos) { - sopno sn; - sop s; - int i; - - /* avoid making error situations worse */ - if (p->error != 0) { - return; - } - - sn = HERE(); - EMIT(op, opnd); /* do checks, ensure space */ - if (HERE() != sn + 1) { - return; - } - s = p->strip[sn]; - - /* adjust paren pointers */ - if (pos > 0) { - for (i = 1; i < NPAREN; i++) { - if (p->pbegin[i] >= pos) { - p->pbegin[i]++; - } - if (p->pend[i] >= pos) { - p->pend[i]++; - } - } - } - - memmove((char *)&p->strip[pos + 1], (char *)&p->strip[pos], - (HERE() - pos - 1) * sizeof(sop)); - p->strip[pos] = s; -} - -/* - - dofwd - complete a forward reference - */ -static void -dofwd(struct parse *p, sopno pos, sop value) { - /* avoid making error situations worse */ - if (p->error != 0) { - return; - } - - if (value < 1 << OPSHIFT) { - p->strip[pos] = OP(p->strip[pos]) | value; - } -} - -/* - - enlarge - enlarge the strip - */ -static void -enlarge(struct parse *p, sopno size) { - sop *sp; - - if (p->ssize >= size) { - return; - } - - sp = (sop *)realloc(p->strip, size * sizeof(sop)); - if (!sp) { - SETERROR(RZ_REGEX_ESPACE); - return; - } - p->strip = sp; - p->ssize = size; -} - -/* - - stripsnug - compact the strip - */ -static void -stripsnug(struct parse *p, struct re_guts *g) { - g->nstates = p->slen; - g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); - if (!g->strip) { - SETERROR(RZ_REGEX_ESPACE); - 
g->strip = p->strip; - } -} - -/* - - findmust - fill in must and mlen with longest mandatory literal string - * - * This algorithm could do fancy things like analyzing the operands of | - * for common subsequences. Someday. This code is simple and finds most - * of the interesting cases. - * - * Note that must and mlen got initialized during setup. - */ -static void -findmust(struct parse *p, struct re_guts *g) { - sop *scan; - sop *start = NULL; /* start initialized in the default case, after that */ - sop *newstart = NULL; /* newstart was initialized in the OCHAR case */ - sopno newlen; - sop s; - char *cp; - sopno i; - - /* avoid making error situations worse */ - if (p->error != 0) { - return; - } - - /* find the longest OCHAR sequence in strip */ - newlen = 0; - start = scan = g->strip + 1; - do { - s = *scan++; - switch (OP(s)) { - case OCHAR: /* sequence member */ - if (newlen == 0) { /* new sequence */ - newstart = scan - 1; - } - newlen++; - break; - case OPLUS_: /* things that don't break one */ - case OLPAREN: - case ORPAREN: - break; - case OQUEST_: /* things that must be skipped */ - case OCH_: - scan--; - do { - scan += OPND(s); - s = *scan; - /* asert() interferes w debug printouts */ - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) { - g->iflags |= BAD; - return; - } - } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* fallthrough */ - default: /* things that break a sequence */ - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - } - newlen = 0; - break; - } - } while (OP(s) != OEND); - - if (g->mlen == 0) { /* there isn't one */ - return; - } - - /* turn it into a character string */ - g->must = malloc((size_t)g->mlen + 1); - if (!g->must) { /* argh; just forget it */ - g->mlen = 0; - return; - } - cp = g->must; - scan = start; - for (i = g->mlen; i > 0; i--) { - while (OP(s = *scan++) != OCHAR) { - continue; - } - if (cp < g->must + g->mlen) { - *cp++ = (char)OPND(s); - } - } - if (cp == g->must + g->mlen) 
{ - *cp++ = '\0'; /* just on general principles */ - } -} - -/* - - pluscount - count + nesting - */ -static sopno /* nesting depth */ -pluscount(struct parse *p, struct re_guts *g) { - sop *scan; - sop s; - sopno plusnest = 0; - sopno maxnest = 0; - - if (p->error != 0) { - return (0); /* there may not be an OEND */ - } - - scan = g->strip + 1; - do { - s = *scan++; - switch (OP(s)) { - case OPLUS_: - plusnest++; - break; - case O_PLUS: - if (plusnest > maxnest) { - maxnest = plusnest; - } - plusnest--; - break; - } - } while (OP(s) != OEND); - if (plusnest != 0) { - g->iflags |= BAD; - } - return (maxnest); -} diff --git a/librz/util/regex/regerror.c b/librz/util/regex/regerror.c deleted file mode 100644 index 069115b7e6b..00000000000 --- a/librz/util/regex/regerror.c +++ /dev/null @@ -1,132 +0,0 @@ -/* $OpenBSD: regerror.c,v 1.13 2005/08/05 13:03:00 espie Exp $ */ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regerror.c 8.4 (Berkeley) 3/20/94 - */ - -#include -#include -#include -#include -#include -#include -#include "rz_regex.h" - -#include "utils.h" - -static char *regatoi(const RzRegex *, char *, int); - -static struct rerr { - int code; - char *name; - char *explain; -} rerrs[] = { - { RZ_REGEX_NOMATCH, "RZ_REGEX_NOMATCH", "regexec() failed to match" }, - { RZ_REGEX_BADPAT, "RZ_REGEX_BADPAT", "invalid regular expression" }, - { RZ_REGEX_ECOLLATE, "RZ_REGEX_ECOLLATE", "invalid collating element" }, - { RZ_REGEX_ECTYPE, "RZ_REGEX_ECTYPE", "invalid character class" }, - { RZ_REGEX_EESCAPE, "RZ_REGEX_EESCAPE", "trailing backslash (\\)" }, - { RZ_REGEX_ESUBREG, "RZ_REGEX_ESUBREG", "invalid backreference number" }, - { RZ_REGEX_EBRACK, "RZ_REGEX_EBRACK", "brackets ([ ]) not balanced" }, - { RZ_REGEX_EPAREN, "RZ_REGEX_EPAREN", "parentheses not balanced" }, - { RZ_REGEX_EBRACE, "RZ_REGEX_EBRACE", "braces not balanced" }, - { RZ_REGEX_BADBR, "RZ_REGEX_BADBR", "invalid repetition count(s)" }, - { RZ_REGEX_ERANGE, "RZ_REGEX_ERANGE", "invalid character range" }, - { RZ_REGEX_ESPACE, "RZ_REGEX_ESPACE", "out of memory" }, - { RZ_REGEX_BADRPT, "RZ_REGEX_BADRPT", "repetition-operator operand 
invalid" }, - { RZ_REGEX_EMPTY, "RZ_REGEX_EMPTY", "empty (sub)expression" }, - { RZ_REGEX_ASSERT, "RZ_REGEX_ASSERT", "\"can't happen\" -- you found a bug" }, - { RZ_REGEX_INVARG, "RZ_REGEX_INVARG", "invalid argument to regex routine" }, - { 0, "", "*** unknown regexp error code ***" } -}; - -/* - - regerror - the interface to error numbers - = extern size_t regerror(int, const regex_t *, char *, size_t); - */ -/* ARGSUSED */ -RZ_API size_t rz_regex_error(int errcode, const RzRegex *preg, char *errbuf, size_t errbuf_size) { - struct rerr *r; - size_t len; - int target = errcode & ~RZ_REGEX_ITOA; - char *s; - char convbuf[50]; - - if (errcode == RZ_REGEX_ATOI) { - s = regatoi(preg, convbuf, sizeof convbuf); - } else { - for (r = rerrs; r->code != 0; r++) { - if (r->code == target) { - break; - } - } - - if (errcode & RZ_REGEX_ITOA) { - if (r->code != 0) { - STRLCPY(convbuf, r->name, sizeof(convbuf) - 1); - } else { - snprintf(convbuf, sizeof convbuf, "RZ_REGEX_0x%x", target); - } - s = convbuf; - } else { - s = r->explain; - } - } - - len = strlen(s) + 1; - if (errbuf_size > 0) { - STRLCPY(errbuf, s, errbuf_size - 1); - } - - return len; -} - -/* - - regatoi - internal routine to implement RZ_REGEX_ATOI - */ -static char * -regatoi(const RzRegex *preg, char *localbuf, int localbufsize) { - struct rerr *r; - - for (r = rerrs; r->code != 0; r++) { - if (strcmp(r->name, preg->re_endp) == 0) { - break; - } - } - if (r->code == 0) { - return ("0"); - } - - (void)snprintf(localbuf, localbufsize, "%d", r->code); - return (localbuf); -} diff --git a/librz/util/regex/regex.3 b/librz/util/regex/regex.3 deleted file mode 100644 index c851b8d8e9c..00000000000 --- a/librz/util/regex/regex.3 +++ /dev/null @@ -1,667 +0,0 @@ -.\" $OpenBSD: regex.3,v 1.21 2007/05/31 19:19:30 jmc Exp $ -.\" -.\" Copyright (c) 1997, Phillip F Knaack. All rights reserved. -.\" -.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. 
-.\" Copyright (c) 1992, 1993, 1994 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" Henry Spencer. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. 
-.\" -.\" @(#)regex.3 8.4 (Berkeley) 3/20/94 -.\" -.Dd $Mdocdate: May 31 2007 $ -.Dt REGEX 3 -.Os -.Sh NAME -.Nm regcomp , -.Nm regexec , -.Nm regerror , -.Nm regfree -.Nd regular expression routines -.Sh SYNOPSIS -.Fd #include -.Fd #include -.Ft int -.Fn regcomp "regex_t *preg" "const char *pattern" "int cflags" -.Pp -.Ft int -.Fn regexec "const regex_t *preg" "const char *string" "size_t nmatch" \ - "regmatch_t pmatch[]" "int eflags" -.Pp -.Ft size_t -.Fn regerror "int errcode" "const regex_t *preg" "char *errbuf" \ - "size_t errbuf_size" -.Pp -.Ft void -.Fn regfree "regex_t *preg" -.Sh DESCRIPTION -These routines implement -.St -p1003.2 -regular expressions -.Pq Dq REs ; -see -.Xr re_format 7 . -.Fn regcomp -compiles an RE written as a string into an internal form, -.Fn regexec -matches that internal form against a string and reports results, -.Fn regerror -transforms error codes from either into human-readable messages, and -.Fn regfree -frees any dynamically allocated storage used by the internal form -of an RE. -.Pp -The header -.Aq Pa regex.h -declares two structure types, -.Li regex_t -and -.Li regmatch_t , -the former for compiled internal forms and the latter for match reporting. -It also declares the four functions, -a type -.Li regoff_t , -and a number of constants with names starting with -.Dv REG_ . -.Pp -.Fn regcomp -compiles the regular expression contained in the -.Fa pattern -string, -subject to the flags in -.Fa cflags , -and places the results in the -.Li regex_t -structure pointed to by -.Fa preg . -.Fa cflags -is the bitwise -.Tn OR -of zero or more of the following flags: -.Bl -tag -width XREG_EXTENDEDX -.It Dv REG_EXTENDED -Compile modern -.Pq Dq extended -REs, -rather than the obsolete -.Pq Dq basic -REs that are the default. -.It Dv REG_BASIC -This is a synonym for 0, -provided as a counterpart to -.Dv REG_EXTENDED -to improve readability. -.It Dv REG_NOSPEC -Compile with recognition of all special characters turned off. 
-All characters are thus considered ordinary, -so the RE is a literal string. -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -.Dv REG_EXTENDED -and -.Dv REG_NOSPEC -may not be used in the same call to -.Fn regcomp . -.It Dv REG_ICASE -Compile for matching that ignores upper/lower case distinctions. -See -.Xr re_format 7 . -.It Dv REG_NOSUB -Compile for matching that need only report success or failure, -not what was matched. -.It Dv REG_NEWLINE -Compile for newline-sensitive matching. -By default, newline is a completely ordinary character with no special -meaning in either REs or strings. -With this flag, -.Ql \&[^ -bracket expressions and -.Ql \&. -never match newline, -a -.Ql ^ -anchor matches the null string after any newline in the string -in addition to its normal function, -and the -.Ql $ -anchor matches the null string before any newline in the -string in addition to its normal function. -.It Dv REG_PEND -The regular expression ends, -not at the first NUL, -but just before the character pointed to by the -.Fa re_endp -member of the structure pointed to by -.Fa preg . -The -.Fa re_endp -member is of type -.Fa const\ char\ * . -This flag permits inclusion of NULs in the RE; -they are considered ordinary characters. -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -.El -.Pp -When successful, -.Fn regcomp -returns 0 and fills in the structure pointed to by -.Fa preg . -One member of that structure -(other than -.Fa re_endp ) -is publicized: -.Fa re_nsub , -of type -.Fa size_t , -contains the number of parenthesized subexpressions within the RE -(except that the value of this member is undefined if the -.Dv REG_NOSUB -flag was used). -If -.Fn regcomp -fails, it returns a non-zero error code; -see DIAGNOSTICS. 
-.Pp -.Fn regexec -matches the compiled RE pointed to by -.Fa preg -against the -.Fa string , -subject to the flags in -.Fa eflags , -and reports results using -.Fa nmatch , -.Fa pmatch , -and the returned value. -The RE must have been compiled by a previous invocation of -.Fn regcomp . -The compiled form is not altered during execution of -.Fn regexec , -so a single compiled RE can be used simultaneously by multiple threads. -.Pp -By default, -the NUL-terminated string pointed to by -.Fa string -is considered to be the text of an entire line, minus any terminating -newline. -The -.Fa eflags -argument is the bitwise -.Tn OR -of zero or more of the following flags: -.Bl -tag -width XREG_STARTENDX -.It Dv REG_NOTBOL -The first character of -the string -is not the beginning of a line, so the -.Ql ^ -anchor should not match before it. -This does not affect the behavior of newlines under -.Dv REG_NEWLINE . -.It Dv REG_NOTEOL -The NUL terminating -the string -does not end a line, so the -.Ql $ -anchor should not match before it. -This does not affect the behavior of newlines under -.Dv REG_NEWLINE . -.It Dv REG_STARTEND -The string is considered to start at -\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR -and to have a terminating NUL located at -\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR -(there need not actually be a NUL at that location), -regardless of the value of -.Fa nmatch . -See below for the definition of -.Fa pmatch -and -.Fa nmatch . -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -Note that a non-zero \fIrm_so\fR does not imply -.Dv REG_NOTBOL ; -.Dv REG_STARTEND -affects only the location of the string, -not how it is matched. -.El -.Pp -See -.Xr re_format 7 -for a discussion of what is matched in situations where an RE or a -portion thereof could match any of several substrings of -.Fa string . 
-.Pp -Normally, -.Fn regexec -returns 0 for success and the non-zero code -.Dv REG_NOMATCH -for failure. -Other non-zero error codes may be returned in exceptional situations; -see DIAGNOSTICS. -.Pp -If -.Dv REG_NOSUB -was specified in the compilation of the RE, -or if -.Fa nmatch -is 0, -.Fn regexec -ignores the -.Fa pmatch -argument (but see below for the case where -.Dv REG_STARTEND -is specified). -Otherwise, -.Fa pmatch -points to an array of -.Fa nmatch -structures of type -.Li regmatch_t . -Such a structure has at least the members -.Fa rm_so -and -.Fa rm_eo , -both of type -.Fa regoff_t -(a signed arithmetic type at least as large as an -.Li off_t -and a -.Li ssize_t ) , -containing respectively the offset of the first character of a substring -and the offset of the first character after the end of the substring. -Offsets are measured from the beginning of the -.Fa string -argument given to -.Fn regexec . -An empty substring is denoted by equal offsets, -both indicating the character following the empty substring. -.Pp -The 0th member of the -.Fa pmatch -array is filled in to indicate what substring of -.Fa string -was matched by the entire RE. -Remaining members report what substring was matched by parenthesized -subexpressions within the RE; -member -.Va i -reports subexpression -.Va i , -with subexpressions counted (starting at 1) by the order of their opening -parentheses in the RE, left to right. -Unused entries in the array\(emcorresponding either to subexpressions that -did not participate in the match at all, or to subexpressions that do not -exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both -.Fa rm_so -and -.Fa rm_eo -set to \-1. -If a subexpression participated in the match several times, -the reported substring is the last one it matched. 
-(Note, as an example in particular, that when the RE -.Dq (b*)+ -matches -.Dq bbb , -the parenthesized subexpression matches each of the three -.Sq b Ns s -and then -an infinite number of empty strings following the last -.Sq b , -so the reported substring is one of the empties.) -.Pp -If -.Dv REG_STARTEND -is specified, -.Fa pmatch -must point to at least one -.Li regmatch_t -(even if -.Fa nmatch -is 0 or -.Dv REG_NOSUB -was specified), -to hold the input offsets for -.Dv REG_STARTEND . -Use for output is still entirely controlled by -.Fa nmatch ; -if -.Fa nmatch -is 0 or -.Dv REG_NOSUB -was specified, -the value of -.Fa pmatch[0] -will not be changed by a successful -.Fn regexec . -.Pp -.Fn regerror -maps a non-zero -.Va errcode -from either -.Fn regcomp -or -.Fn regexec -to a human-readable, printable message. -If -.Fa preg -is non-NULL, -the error code should have arisen from use of -the -.Li regex_t -pointed to by -.Fa preg , -and if the error code came from -.Fn regcomp , -it should have been the result from the most recent -.Fn regcomp -using that -.Li regex_t . -.Pf ( Fn regerror -may be able to supply a more detailed message using information -from the -.Li regex_t . ) -.Fn regerror -places the NUL-terminated message into the buffer pointed to by -.Fa errbuf , -limiting the length (including the NUL) to at most -.Fa errbuf_size -bytes. -If the whole message won't fit, -as much of it as will fit before the terminating NUL is supplied. -In any case, -the returned value is the size of buffer needed to hold the whole -message (including the terminating NUL). -If -.Fa errbuf_size -is 0, -.Fa errbuf -is ignored but the return value is still correct. -.Pp -If the -.Fa errcode -given to -.Fn regerror -is first -.Tn OR Ns 'ed -with -.Dv REG_ITOA , -the -.Dq message -that results is the printable name of the error code, -e.g., -.Dq REG_NOMATCH , -rather than an explanation thereof. 
-If -.Fa errcode -is -.Dv REG_ATOI , -then -.Fa preg -shall be non-null and the -.Fa re_endp -member of the structure it points to -must point to the printable name of an error code; -in this case, the result in -.Fa errbuf -is the decimal digits of -the numeric value of the error code -(0 if the name is not recognized). -.Dv REG_ITOA -and -.Dv REG_ATOI -are intended primarily as debugging facilities; -they are extensions, -compatible with but not specified by -.St -p1003.2 -and should be used with -caution in software intended to be portable to other systems. -Be warned also that they are considered experimental and changes are possible. -.Pp -.Fn regfree -frees any dynamically allocated storage associated with the compiled RE -pointed to by -.Fa preg . -The remaining -.Li regex_t -is no longer a valid compiled RE -and the effect of supplying it to -.Fn regexec -or -.Fn regerror -is undefined. -.Pp -None of these functions references global variables except for tables -of constants; -all are safe for use from multiple threads if the arguments are safe. -.Sh IMPLEMENTATION CHOICES -There are a number of decisions that -.St -p1003.2 -leaves up to the implementor, -either by explicitly saying -.Dq undefined -or by virtue of them being -forbidden by the RE grammar. -This implementation treats them as follows. -.Pp -See -.Xr re_format 7 -for a discussion of the definition of case-independent matching. -.Pp -There is no particular limit on the length of REs, -except insofar as memory is limited. -Memory usage is approximately linear in RE size, and largely insensitive -to RE complexity, except for bounded repetitions. -See -.Sx BUGS -for one short RE using them -that will run almost any system out of memory. -.Pp -A backslashed character other than one specifically given a magic meaning -by -.St -p1003.2 -(such magic meanings occur only in obsolete REs) -is taken as an ordinary character. -.Pp -Any unmatched -.Ql \&[ -is a -.Dv REG_EBRACK -error. 
-.Pp -Equivalence classes cannot begin or end bracket-expression ranges. -The endpoint of one range cannot begin another. -.Pp -RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255. -.Pp -A repetition operator (?, *, +, or bounds) cannot follow another -repetition operator. -A repetition operator cannot begin an expression or subexpression -or follow -.Ql ^ -or -.Ql | . -.Pp -A -.Ql | -cannot appear first or last in a (sub)expression, or after another -.Ql | , -i.e., an operand of -.Ql | -cannot be an empty subexpression. -An empty parenthesized subexpression, -.Ql \&(\&) , -is legal and matches an -empty (sub)string. -An empty string is not a legal RE. -.Pp -A -.Ql { -followed by a digit is considered the beginning of bounds for a -bounded repetition, which must then follow the syntax for bounds. -A -.Ql { -.Em not -followed by a digit is considered an ordinary character. -.Pp -.Ql ^ -and -.Ql $ -beginning and ending subexpressions in obsolete -.Pq Dq basic -REs are anchors, not ordinary characters. 
-.Sh DIAGNOSTICS -Non-zero error codes from -.Fn regcomp -and -.Fn regexec -include the following: -.Pp -.Bl -tag -compact -width XREG_ECOLLATEX -.It Er REG_NOMATCH -regexec() failed to match -.It Er REG_BADPAT -invalid regular expression -.It Er REG_ECOLLATE -invalid collating element -.It Er REG_ECTYPE -invalid character class -.It Er REG_EESCAPE -\e applied to unescapable character -.It Er REG_ESUBREG -invalid backreference number -.It Er REG_EBRACK -brackets [ ] not balanced -.It Er REG_EPAREN -parentheses ( ) not balanced -.It Er REG_EBRACE -braces { } not balanced -.It Er REG_BADBR -invalid repetition count(s) in { } -.It Er REG_ERANGE -invalid character range in [ ] -.It Er REG_ESPACE -ran out of memory -.It Er REG_BADRPT -?, *, or + operand invalid -.It Er REG_EMPTY -empty (sub)expression -.It Er REG_ASSERT -.Dq can't happen -\(emyou found a bug -.It Er REG_INVARG -invalid argument, e.g., negative-length string -.El -.Sh SEE ALSO -.Xr grep 1 , -.Xr re_format 7 -.Pp -.St -p1003.2 , -sections 2.8 (Regular Expression Notation) -and -B.5 (C Binding for Regular Expression Matching). -.Sh HISTORY -Originally written by Henry Spencer. -Altered for inclusion in the -.Bx 4.4 -distribution. -.Sh BUGS -This is an alpha release with known defects. -Please report problems. -.Pp -There is one known functionality bug. -The implementation of internationalization is incomplete: -the locale is always assumed to be the default one of -.St -p1003.2 , -and only the collating elements etc. of that locale are available. -.Pp -The back-reference code is subtle and doubts linger about its correctness -in complex cases. -.Pp -.Fn regexec -performance is poor. -This will improve with later releases. -.Fa nmatch -exceeding 0 is expensive; -.Fa nmatch -exceeding 1 is worse. -.Fn regexec -is largely insensitive to RE complexity -.Em except -that back references are massively expensive. 
-RE length does matter; in particular, there is a strong speed bonus -for keeping RE length under about 30 characters, -with most special characters counting roughly double. -.Pp -.Fn regcomp -implements bounded repetitions by macro expansion, -which is costly in time and space if counts are large -or bounded repetitions are nested. -A RE like, say, -.Dq ((((a{1,100}){1,100}){1,100}){1,100}){1,100} -will (eventually) run almost any existing machine out of swap space. -.Pp -There are suspected problems with response to obscure error conditions. -Notably, -certain kinds of internal overflow, -produced only by truly enormous REs or by multiply nested bounded repetitions, -are probably not handled well. -.Pp -Due to a mistake in -.St -p1003.2 , -things like -.Ql a)b -are legal REs because -.Ql \&) -is -a special character only in the presence of a previous unmatched -.Ql \&( . -This can't be fixed until the spec is fixed. -.Pp -The standard's definition of back references is vague. -For example, does -.Dq a\e(\e(b\e)*\e2\e)*d -match -.Dq abbbd ? -Until the standard is clarified, -behavior in such cases should not be relied on. -.Pp -The implementation of word-boundary matching is a bit of a kludge, -and bugs may lurk in combinations of word-boundary matching and anchoring. diff --git a/librz/util/regex/regex2.h b/librz/util/regex/regex2.h deleted file mode 100644 index cb84e2e3306..00000000000 --- a/librz/util/regex/regex2.h +++ /dev/null @@ -1,158 +0,0 @@ -/* $OpenBSD: regex2.h,v 1.7 2004/11/30 17:04:23 otto Exp $ */ - -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regex2.h 8.4 (Berkeley) 3/20/94 - */ - -/* - * internals of regex_t - */ -#define MAGIC1 ((('r' ^ 0200) << 8) | 'e') - -/* - * The internal representation is a *strip*, a sequence of - * operators ending with an endmarker. (Some terminology etc. is a - * historical relic of earlier versions which used multiple strips.) - * Certain oddities in the representation are there to permit running - * the machinery backwards; in particular, any deviation from sequential - * flow must be marked at both its source and its destination. Some - * fine points: - * - * - OPLUS_ and O_PLUS are *inside* the loop they create. 
- * - OQUEST_ and O_QUEST are *outside* the bypass they create. - * - OCH_ and O_CH are *outside* the multi-way branch they create, while - * OOR1 and OOR2 are respectively the end and the beginning of one of - * the branches. Note that there is an implicit OOR2 following OCH_ - * and an implicit OOR1 preceding O_CH. - * - * In state representations, an operator's bit is on to signify a state - * immediately *preceding* "execution" of that operator. - */ -typedef unsigned long sop; /* strip operator */ -typedef long sopno; -#define OPRMASK 0xf8000000LU -#define OPDMASK 0x07ffffffLU -#define OPSHIFT ((unsigned)27) -#define OP(n) ((n)&OPRMASK) -#define OPND(n) ((n)&OPDMASK) -#define SOP(op, opnd) ((op) | (opnd)) -/* operators meaning operand */ -/* (back, fwd are offsets) */ -#define OEND (1LU << OPSHIFT) /* endmarker - */ -#define OCHAR (2LU << OPSHIFT) /* character unsigned char */ -#define OBOL (3LU << OPSHIFT) /* left anchor - */ -#define OEOL (4LU << OPSHIFT) /* right anchor - */ -#define OANY (5LU << OPSHIFT) /* . - */ -#define OANYOF (6LU << OPSHIFT) /* [...] set number */ -#define OBACK_ (7LU << OPSHIFT) /* begin \d paren number */ -#define O_BACK (8LU << OPSHIFT) /* end \d paren number */ -#define OPLUS_ (9LU << OPSHIFT) /* + prefix fwd to suffix */ -#define O_PLUS (10LU << OPSHIFT) /* + suffix back to prefix */ -#define OQUEST_ (11LU << OPSHIFT) /* ? prefix fwd to suffix */ -#define O_QUEST (12LU << OPSHIFT) /* ? suffix back to prefix */ -#define OLPAREN (13LU << OPSHIFT) /* ( fwd to ) */ -#define ORPAREN (14LU << OPSHIFT) /* ) back to ( */ -#define OCH_ (15LU << OPSHIFT) /* begin choice fwd to OOR2 */ -#define OOR1 (16LU << OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */ -#define OOR2 (17LU << OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */ -#define O_CH (18LU << OPSHIFT) /* end choice back to OOR1 */ -#define OBOW (19LU << OPSHIFT) /* begin word - */ -#define OEOW (20LU << OPSHIFT) /* end word - */ - -/* - * Structure for [] character-set representation. 
Character sets are - * done as bit vectors, grouped 8 to a byte vector for compactness. - * The individual set therefore has both a pointer to the byte vector - * and a mask to pick out the relevant bit of each byte. A hash code - * simplifies testing whether two sets could be identical. - * - * This will get trickier for multicharacter collating elements. As - * preliminary hooks for dealing with such things, we also carry along - * a string of multi-character elements, and decide the size of the - * vectors at run time. - */ -typedef struct { - ut8 *ptr; /* -> ut8 [csetsize] */ - ut8 mask; /* bit within array */ - ut8 hash; /* hash code */ - size_t smultis; - char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ -} cset; -/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ -#define CHadd(cs, c) ((cs)->ptr[(ut8)(c)] |= (cs)->mask, (cs)->hash += (c)) -#define CHsub(cs, c) ((cs)->ptr[(ut8)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) -#define CHIN(cs, c) ((cs)->ptr[(ut8)(c)] & (cs)->mask) -#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ -#define MCsub(p, cs, cp) mcsub(p, cs, cp) -#define MCin(p, cs, cp) mcin(p, cs, cp) - -/* stuff for character categories */ -typedef unsigned char cat_t; - -/* - * main compiled-expression structure - */ -struct re_guts { - int magic; -#define MAGIC2 ((('R' ^ 0200) << 8) | 'E') - sop *strip; /* malloced area for strip */ - int csetsize; /* number of bits in a cset vector */ - int ncsets; /* number of csets in use */ - cset *sets; /* -> cset [ncsets] */ - ut8 *setbits; /* -> ut8[csetsize][ncsets/CHAR_BIT] */ - int cflags; /* copy of regcomp() cflags argument */ - sopno nstates; /* = number of sops */ - sopno firststate; /* the initial OEND (normally 0) */ - sopno laststate; /* the final OEND */ - int iflags; /* internal flags */ -#define USEBOL 01 /* used ^ */ -#define USEEOL 02 /* used $ */ -#define BAD 04 /* something wrong */ - int nbol; /* number of ^ used */ - int neol; /* number of $ used */ - 
int ncategories; /* how many character categories */ - cat_t *categories; /* ->catspace[-CHAR_MIN] */ - char *must; /* match must contain this string */ - int mlen; /* length of must */ - size_t nsub; /* copy of re_nsub */ - int backrefs; /* does it use back references? */ - sopno nplus; /* how deep does it nest +s? */ - /* catspace must be last */ - cat_t catspace[1]; /* actually [NC] */ -}; - -/* misc utilities */ -#undef OUT -#define OUT (-CHAR_MIN + CHAR_MAX + 1) /* a non-character value */ -#define ISWORD(c) (isalnum((ut8)(c)) || (c) == '_') diff --git a/librz/util/regex/regexec.c b/librz/util/regex/regexec.c deleted file mode 100644 index 97ba732dc95..00000000000 --- a/librz/util/regex/regexec.c +++ /dev/null @@ -1,174 +0,0 @@ -/* $OpenBSD: regexec.c,v 1.11 2005/08/05 13:03:00 espie Exp $ */ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regexec.c 8.3 (Berkeley) 3/20/94 - */ - -/* - * the outer shell of regexec() - * - * This file includes engine.c *twice*, after muchos fiddling with the - * macros that code uses. This lets the same code operate on two different - * representations for state sets. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "utils.h" -#include "regex2.h" - -/* macros for manipulating states, small version */ -#define states1 long long /* for later use in regexec() decision */ -#define states states1 -#define CLEAR(v) ((v) = 0) -#define SET0(v, n) ((v) &= ~((unsigned states)1 << (n))) -#define SET1(v, n) ((v) |= (unsigned states)1 << (n)) -#define ISSET(v, n) (((v) & ((unsigned states)1 << (n))) != 0) -#define ASSIGN(d, s) ((d) = (s)) -#define EQ(a, b) ((a) == (b)) -#define STATEVARS states dummy /* dummy version */ -#define STATESETUP(m, n) /* nothing */ -#define STATETEARDOWN(m) /* nothing */ -#define SETUP(v) ((v) = 0) -#define onestate states -#define INIT(o, n) ((o) = (unsigned states)1 << (n)) -#define INC(o) ((o) <<= 1) -#define ISSTATEIN(v, o) (((v) & (o)) != 0) -/* some abbreviations; note that some of these know variable names! 
*/ -/* do "if I'm here, I can also be there" etc without branches */ -#define FWD(dst, src, n) ((dst) |= ((unsigned states)(src) & (here)) << (n)) -#define BACK(dst, src, n) ((dst) |= ((unsigned states)(src) & (here)) >> (n)) -#define ISSETBACK(v, n) (((v) & ((unsigned states)here >> (n))) != 0) -/* function names */ -#define SNAMES /* engine.c looks after details */ - -#include "engine.c" - -/* now undo things */ -#undef states -#undef CLEAR -#undef SET0 -#undef SET1 -#undef ISSET -#undef ASSIGN -#undef EQ -#undef STATEVARS -#undef STATESETUP -#undef STATETEARDOWN -#undef SETUP -#undef onestate -#undef INIT -#undef INC -#undef ISSTATEIN -#undef FWD -#undef BACK -#undef ISSETBACK -#undef SNAMES - -/* macros for manipulating states, large version */ -#define states char * -#define CLEAR(v) memset(v, 0, m->g->nstates) -#define SET0(v, n) ((v)[n] = 0) -#define SET1(v, n) ((v)[n] = 1) -#define ISSET(v, n) ((v)[n]) -#define ASSIGN(d, s) memcpy(d, s, m->g->nstates) -#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0) -#define STATEVARS \ - states1 vn; \ - char *space -#define STATESETUP(m, nv) \ - { \ - (m)->space = malloc((nv) * (m)->g->nstates); \ - if (!(m)->space) \ - return RZ_REGEX_ESPACE; \ - (m)->vn = 0; \ - } -#define STATETEARDOWN(m) \ - { free((m)->space); } -#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) -#define onestate states1 -#define INIT(o, n) ((o) = (n)) -#define INC(o) ((o)++) -#define ISSTATEIN(v, o) ((v)[o]) -/* some abbreviations; note that some of these know variable names! 
*/ -/* do "if I'm here, I can also be there" etc without branches */ -#define FWD(dst, src, n) ((dst)[here + (n)] |= (src)[here]) -#define BACK(dst, src, n) ((dst)[here - (n)] |= (src)[here]) -#define ISSETBACK(v, n) ((v)[here - (n)]) -/* function names */ -#define LNAMES /* flag */ - -#include "engine.c" - -RZ_API bool rz_regex_check(const RzRegex *rr, const char *str) { - return rz_regex_exec(rr, str, 0, NULL, rr->re_flags); -} -/* - - regexec - interface for matching - * - * We put this here so we can exploit knowledge of the state representation - * when choosing which matcher to call. Also, by this point the matchers - * have been prototyped. - */ -/* 0 success, RZ_REGEX_NOMATCH failure */ -RZ_API int rz_regex_exec(const RzRegex *preg, const char *string, size_t nmatch, - RzRegexMatch pmatch[], int eflags) { - struct re_guts *g; -#ifdef REDEBUG -#define GOODFLAGS(f) (f) -#else -#define GOODFLAGS(f) ((f) & (RZ_REGEX_NOTBOL | RZ_REGEX_NOTEOL | RZ_REGEX_STARTEND | RZ_REGEX_LARGE)) -#endif - if (!preg || !string) { - return RZ_REGEX_ASSERT; - } - - g = preg->re_g; - if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) { - return (RZ_REGEX_BADPAT); - } - if (g->iflags & BAD) { /* backstop for no-debug case */ - return (RZ_REGEX_BADPAT); - } - eflags = GOODFLAGS(eflags); - if (g->nstates <= CHAR_BIT * sizeof(states1) && !(eflags & RZ_REGEX_LARGE)) { - return (smatcher(g, (char *)string, nmatch, pmatch, eflags)); - } else { - return (lmatcher(g, (char *)string, nmatch, pmatch, eflags)); - } -} diff --git a/librz/util/regex/test.c b/librz/util/regex/test.c deleted file mode 100644 index 91fc75a9d44..00000000000 --- a/librz/util/regex/test.c +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include - -int _main(void) { - RzRegex rx; - int rc = rz_regex_comp(&rx, "^hi", RZ_REGEX_NOSUB); - if (rc) { - printf("error\n"); - - } else { - rc = rz_regex_exec(&rx, "patata", 0, 0, 0); - printf("out = %d\n", rc); - - rc = rz_regex_exec(&rx, "hillow", 0, 0, 0); - printf("out = %d\n", 
rc); - } - rz_regex_free(&rx); - return 0; -} - -static void test_or(void) { - RzRegex *rx = rz_regex_new("(eax|ebx)", "e"); - printf("result (%s) = %d\n", "mov eax", rz_regex_match("(eax|ebx)", "e", "mov eax")); - printf("result (%s) = %d\n", "mov ebx", rz_regex_match("(eax|ebx)", "e", "mov ebx")); - printf("result (%s) = %d\n", "mov eax", rz_regex_match("(eax|ebx)", "e", "mov ecx")); - printf("result (%s) = %d\n", "mov ebx", rz_regex_match("(eax|ecx)", "e", "mov ebx")); - printf("result (%s) = %d\n", "mov eax", rz_regex_check(rx, "mov eax")); - printf("result (%s) = %d\n", "mov ebx", rz_regex_check(rx, "mov ebx")); - printf("result (%s) = %d\n", "mov eax", rz_regex_exec(rx, "mov eax", 0, 0, 1)); - printf("result (%s) = %d\n", "mov ebx", rz_regex_exec(rx, "mov ebx", 0, 0, 1)); - rz_regex_free(rx); -} - -int main(int argc, char **argv) { - const char *needle = "^hi"; - const char *haystack_1 = "patata"; - const char *haystack_2 = "hillow"; - if (argc > 3) { - needle = argv[1]; - haystack_1 = argv[2]; - haystack_2 = argv[3]; - } else - printf("Using default values\n"); - RzRegex *rx = rz_regex_new(needle, ""); - if (rx) { - int res = rz_regex_exec(rx, haystack_1, 0, 0, 0); - printf("result (%s) = %d\n", haystack_1, res); - res = rz_regex_exec(rx, haystack_2, 0, 0, 0); - printf("result (%s) = %d\n", haystack_2, res); - rz_regex_free(rx); - } else - printf("oops, cannot compile regexp\n"); - test_or(); - return 0; -} diff --git a/librz/util/regex/utils.h b/librz/util/regex/utils.h deleted file mode 100644 index 5bcda97a97e..00000000000 --- a/librz/util/regex/utils.h +++ /dev/null @@ -1,62 +0,0 @@ -/* $OpenBSD: utils.h,v 1.4 2003/06/02 20:18:36 millert Exp $ */ - -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)utils.h 8.3 (Berkeley) 3/20/94 - */ - -/* utility definitions */ -#define DUPMAX 255 -//_POSIX2_RE_DUP_MAX /* xxx is this right? */ -#define INTFINITY (DUPMAX + 1) -#define NC (CHAR_MAX - CHAR_MIN + 1) - -#define STRLCPY(x, y, z) \ - { \ - strncpy((x), (y), (z)); \ - (x)[(z) ? 
(z)-1 : 0] = 0; \ - } - -/* switch off assertions (if not already off) if no REDEBUG */ -#ifndef REDEBUG -#ifndef NDEBUG -#define NDEBUG /* no assertions please */ -#endif -#endif -#include - -/* for old systems with bcopy() but no memmove() */ -#ifdef USEBCOPY -#define memmove(d, s, c) bcopy(s, d, c) -#endif -#define ut8 unsigned char diff --git a/librz/util/str.c b/librz/util/str.c index 14d877b6042..f3efeb643a3 100644 --- a/librz/util/str.c +++ b/librz/util/str.c @@ -1,13 +1,14 @@ // SPDX-FileCopyrightText: 2007-2020 pancake // SPDX-License-Identifier: LGPL-3.0-only -#include +#include #include "rz_list.h" #include "rz_types.h" #include "rz_util.h" #include "rz_cons.h" #include "rz_bin.h" #include "rz_util/rz_assert.h" +#include #include #include #include @@ -1180,7 +1181,7 @@ RZ_API RZ_OWN char *rz_str_replace(RZ_OWN char *str, const char *key, const char if (!newstr) { eprintf("realloc fail\n"); RZ_FREE(str); - break; + return NULL; } str = newstr; } @@ -3409,31 +3410,35 @@ static RzList /**/ *str_split_list_common(char *str, const char *c, int static RzList /**/ *str_split_list_common_regex(RZ_BORROW char *str, RZ_BORROW RzRegex *r, int n, bool trim, bool dup) { rz_return_val_if_fail(str && r, NULL); RzList *lst = rz_list_newf(dup ? 
free : NULL); - RzRegexMatch m[1]; char *aux; int i = 0; int s = 0, e = 0; int j = 0; - while (rz_regex_exec(r, str + j, 1, m, 0) == 0) { + void **it; + RzPVector *matches = rz_regex_match_all(r, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + rz_pvector_foreach (matches, it) { + RzPVector *m = (RzPVector *)*it; + RzRegexMatch *group0 = rz_pvector_head(m); if (n == i && n > 0) { break; } - s = m[0].rm_so; // Match start (inclusive) in string str + j - e = m[0].rm_eo; // Match end (exclusive) in string str + j + s = group0->start; // Match start (inclusive) in string str + j + e = group0->start + group0->len; // Match end (exclusive) in string str + j if (dup) { - aux = rz_str_ndup(str + j, s); + aux = rz_str_ndup(str + j, s - j); } else { // Overwrite split chararcters. - memset(str + j + s, 0, e - s); + memset(str + s, 0, e - s); aux = str + j; } if (trim) { rz_str_trim(aux); } rz_list_append(lst, aux); - j += e; + j = e; ++i; } + rz_pvector_free(matches); if (*(str + j) == 0 || (n == i && n > 0) || rz_list_length(lst) == 0) { // No token left. 
return lst; @@ -3483,7 +3488,7 @@ RZ_API RzList /**/ *rz_str_split_list(char *str, const char *c, int n) { */ RZ_API RZ_OWN RzList /**/ *rz_str_split_list_regex(RZ_NONNULL char *str, RZ_NONNULL const char *r, int n) { rz_return_val_if_fail(str && r, NULL); - RzRegex *regex = rz_regex_new(r, "e"); + RzRegex *regex = rz_regex_new(r, RZ_REGEX_EXTENDED, 0); RzList *res = str_split_list_common_regex(str, regex, n, false, false); rz_regex_free(regex); return res; @@ -3545,7 +3550,7 @@ RZ_API RzList /**/ *rz_str_split_duplist_n(const char *_str, const char RZ_API RZ_OWN RzList /**/ *rz_str_split_duplist_n_regex(RZ_NONNULL const char *_str, RZ_NONNULL const char *r, int n, bool trim) { rz_return_val_if_fail(_str && r, NULL); char *str = strdup(_str); - RzRegex *regex = rz_regex_new(r, "e"); + RzRegex *regex = rz_regex_new(r, RZ_REGEX_EXTENDED, 0); RzList *res = str_split_list_common_regex(str, regex, n, trim, true); free(str); rz_regex_free(regex); diff --git a/meson.build b/meson.build index 5b68819aab2..a55a9e0b50e 100644 --- a/meson.build +++ b/meson.build @@ -195,6 +195,24 @@ else add_project_arguments(['-DUSE_SYS_CAPSTONE'], language: 'c') endif +# Handle PCRE2 +cpu_jit_supported = [ 'aarch64', 'arm', 'mips', 'mips64', 'ppc', 'ppc64', 'riscv32', 'riscv64', 's390x', 'x86', 'x86_64' ] +pcre2_jit_supported = target_machine.cpu_family() in cpu_jit_supported and cc.get_id() != 'tcc' +if pcre2_jit_supported + add_project_arguments(['-DSUPPORTS_PCRE2_JIT'], language: 'c') +endif + +pcre2_dep_opt = get_option('use_sys_pcre2') +pcre2_dep = disabler() +if pcre2_dep_opt.enabled() or pcre2_dep_opt.auto() + pcre2_dep = dependency('libpcre2-8', required: false, static: true) + if not pcre2_dep.found() + pcre2_dep = cc.find_library('pcre2', required: true, static: true) + endif +else + pcre2_dep = dependency('pcre2', 'pcre2_dep', version: '>=10.42', required: true, static: true) +endif + # handle magic library sys_magic_opt = get_option('use_sys_magic') sys_magic = disabler() @@ 
-339,6 +357,8 @@ foreach it : ccs it_userconf.set10('IS_PORTABLE', get_option('portable')) it_userconf.set10('HAVE_LIB_MAGIC', sys_magic.found()) it_userconf.set10('USE_LIB_MAGIC', sys_magic.found()) + it_userconf.set10('HAVE_LIB_PCRE2', pcre2_dep.found()) + it_userconf.set10('USE_LIB_PCRE2', pcre2_dep.found()) it_userconf.set10('HAVE_LIB_XXHASH', xxhash_dep.found()) it_userconf.set10('USE_LIB_XXHASH', xxhash_dep.found()) it_userconf.set10('DEBUGGER', has_debugger) @@ -380,6 +400,7 @@ foreach it : ccs it_userconf.set10('HAVE_PTHREAD', have_pthread) it_userconf.set10('HAVE_LZMA', get_option('use_lzma')) it_userconf.set10('HAVE_ZLIB', get_option('use_zlib')) + it_userconf.set10('SUPPORTS_PCRE2_JIT', pcre2_jit_supported) if it_machine.system() == 'freebsd' or it_machine.system() == 'dragonfly' add_project_link_arguments('-Wl,--unresolved-symbols,ignore-in-object-files', language: 'c', native: it_native) @@ -749,7 +770,10 @@ summary({ 'Swift demangler': get_option('use_swift_demangler'), 'Debugger enabled': has_debugger, 'Capstone version': capstone_dep.version(), + 'PCRE2 version': pcre2_dep.version(), + 'PCRE2 JIT': pcre2_jit_supported, 'System magic library': sys_magic.found() and sys_magic.type_name() != 'internal', + 'System pcre2 library': pcre2_dep.found() and pcre2_dep.type_name() != 'internal', 'System xxhash library': xxhash_dep.found() and xxhash_dep.type_name() != 'internal', 'System libmspack library': libmspack_dep.found() and libmspack_dep.type_name() != 'internal', 'System openssl library': sys_openssl.found() and sys_openssl.type_name() != 'internal', diff --git a/meson_options.txt b/meson_options.txt index e51701f7686..7abaf40e1ab 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -34,6 +34,7 @@ option('use_sys_xxhash', type: 'feature', value: 'disabled') option('use_sys_openssl', type: 'feature', value: 'disabled') option('use_sys_libmspack', type: 'feature', value: 'disabled') option('use_sys_tree_sitter', type: 'feature', value: 'disabled') 
+option('use_sys_pcre2', type: 'feature', value: 'disabled') option('use_swift_demangler', type: 'boolean', value: true, description: 'If false, disables the swift demangler') option('use_gpl', type: 'boolean', value: true, description: 'Set to false when you want to disable gpl code') option('install_sigdb', type: 'boolean', value: false, description: 'Downloads and installs rizin sigdb') diff --git a/subprojects/packagefiles/pcre2/meson.build b/subprojects/packagefiles/pcre2/meson.build new file mode 100644 index 00000000000..1fcc5752388 --- /dev/null +++ b/subprojects/packagefiles/pcre2/meson.build @@ -0,0 +1,85 @@ +project('pcre2', 'c', version: '10.42') + +cc = meson.get_compiler('c') + +conf_data = configuration_data() + +pcre2_chartables = configure_file(input : 'src/pcre2_chartables.c.dist', + output : 'pcre2_chartables.c', + configuration : conf_data) + +pcre2_h = configure_file(input : 'src/pcre2.h.generic', + output : 'pcre2.h', + configuration : conf_data) + +config_h = configure_file(input : 'src/config.h.generic', + output : 'config.h', + configuration : conf_data) + +libpcre2_c_args = [ + '-DHAVE_CONFIG_H', # Default values from config.h + '-DPCRE2_CODE_UNIT_WIDTH=8', + '-DHAVE_MEMMOVE', + '-DSUPPORT_PCRE2_8', + '-DSUPPORT_UNICODE', + '-fvisibility=default', +] + +pcre2_files = [ + 'src/pcre2_auto_possess.c', + pcre2_chartables, + 'src/pcre2_compile.c', + 'src/pcre2_config.c', + 'src/pcre2_context.c', + 'src/pcre2_convert.c', + 'src/pcre2_dfa_match.c', + 'src/pcre2_error.c', + 'src/pcre2_extuni.c', + 'src/pcre2_find_bracket.c', + 'src/pcre2_maketables.c', + 'src/pcre2_match.c', + 'src/pcre2_match_data.c', + 'src/pcre2_newline.c', + 'src/pcre2_ord2utf.c', + 'src/pcre2_pattern_info.c', + 'src/pcre2_script_run.c', + 'src/pcre2_serialize.c', + 'src/pcre2_string_utils.c', + 'src/pcre2_study.c', + 'src/pcre2_substitute.c', + 'src/pcre2_substring.c', + 'src/pcre2_tables.c', + 'src/pcre2_ucd.c', + 'src/pcre2_valid_utf.c', + 'src/pcre2_xclass.c', +] + 
+cpu_jit_supported = [ 'aarch64', 'arm', 'mips', 'mips64', 'ppc', 'ppc64', 'riscv32', 'riscv64', 's390x', 'x86', 'x86_64' ] + +# tcc doesn't support the MSVC asm syntax PCRE2 uses (`__asm { ... }`). +# It is used in the JIT compiler code. +if cc.get_id() != 'tcc' and target_machine.cpu_family() in cpu_jit_supported + libpcre2_c_args += ['-DSUPPORT_JIT'] + pcre2_files += ['src/pcre2_jit_compile.c'] +endif + +if target_machine.system() == 'openbsd' or target_machine.system() == 'netbsd' + # jit compilation fails with "no more memory" if wx allocations are allowed. + libpcre2_c_args += ['-DSLJIT_WX_EXECUTABLE_ALLOCATOR'] +endif + +pcre2_includes = [ + include_directories('.'), + include_directories('src/'), +] + +libpcre2 = static_library('pcre2', pcre2_files, + c_args: libpcre2_c_args, + include_directories: pcre2_includes, + install: false, +) + +pcre2_dep = declare_dependency( + link_with: libpcre2, + include_directories: pcre2_includes +) diff --git a/subprojects/pcre2.wrap b/subprojects/pcre2.wrap new file mode 100644 index 00000000000..2bfaf19a850 --- /dev/null +++ b/subprojects/pcre2.wrap @@ -0,0 +1,8 @@ +[wrap-git] +url = https://github.com/PCRE2Project/pcre2.git +revision = 52c08847921a324c804cabf2814549f50bce1265 +directory = pcre2 +patch_directory = pcre2 + +[provide] +pcre2=pcre2_dep diff --git a/test/db/archos/darwin-arm64/dbg b/test/db/archos/darwin-arm64/dbg index 21e2b25dae7..4c444491c7d 100644 --- a/test/db/archos/darwin-arm64/dbg +++ b/test/db/archos/darwin-arm64/dbg @@ -31,10 +31,14 @@ stur w0, [fp, -4] stur x1, [fp, -0x10] str x8, [sp, 0x10] EOF -REGEXP_FILTER_ERR=(([a-zA-Z:]+|[0-9a-f][0-9a-f][0-9a-f])\s+) +REGEXP_FILTER_ERR=(((Continue\suntil)|(hit\sbreakpoint\sat:)|[0-9a-f][0-9a-f][0-9a-f]\n)) EXPECT_ERR=< 0x0000421a mov qword [0x000232b0], 0x50 ; 'P' |  ; [0x232b0:8]=0 - | 0x00004225 mov rax, qword str.COLUMNS ; [0x18a22:8]=0x534e4d554c4f43 ; "COLUMNS" + | 0x00004225 mov rax, qword [str.COLUMNS] ; [0x18a22:8]=0x534e4d554c4f43 ; "COLUMNS" 
0x000041ee mov rax, qword [rip + 0x1d84b] ; [0x21a40:8]=0x18d31 str.literal 0x00004225 mov rax, qword [rip + 0x147f6] ; str.COLUMNS @@ -709,7 +709,7 @@ EXPECT=< 0x0000421a mov qword [0x000232b0], 0x50 ; 'P' |  ; [0x232b0:8]=0 - | 0x00004225 cmp rdi, qword str.COLUMNS ; [0x18a22:8]=0x534e4d554c4f43 ; "COLUMNS" + | 0x00004225 cmp rdi, qword [str.COLUMNS] ; [0x18a22:8]=0x534e4d554c4f43 ; "COLUMNS" 0x000041ee cmp rsi, qword [rip + 0x1d84b] ; [0x21a40:8]=0x18d31 str.literal 0x00004225 cmp rdi, qword [rip + 0x147f6] ; str.COLUMNS diff --git a/test/db/formats/pdb b/test/db/formats/pdb index 747aca9a671..4863eb96546 100644 --- a/test/db/formats/pdb +++ b/test/db/formats/pdb @@ -531,7 +531,7 @@ mkdir .tmp idpx bins/pdb/basic32.pd_ .tmp !rz-hash -a md5 .tmp/basic32.pdb EOF -REGEXP_FILTER_ERR=(ERROR:.+\nINFO:.+\n) +REGEXP_FILTER_ERR=(ERROR:.+\nINFO:.+) EXPECT_ERR=< 0x00000040 POP_TOP EOF REGEXP_FILTER_ERR=< 0x0000003c POP_TOP EOF REGEXP_FILTER_ERR=<.hello_world EOF REGEXP_FILTER_ERR=< // SPDX-License-Identifier: LGPL-3.0-only -#include #include "minunit.h" +#include +#include +#include +#include -bool exec_regex(RzRegex *regex, const char *str, RzRegexMatch *out) { - RzRegexMatch match[2]; - mu_assert_true(rz_regex_exec(regex, str, 1, &match[0], 0) == 0, "Regex match failed"); - mu_assert_true(rz_regex_exec(regex, str, 1, &match[1], RZ_REGEX_LARGE) == 0, "Regex match failed for large engine"); - mu_assert_memeq((ut8 *)&match[0], (ut8 *)&match[1], sizeof(RzRegexMatch), "Results from large engine match does not equal small engine match"); - *out = match[0]; +bool exec_regex(RzRegex *regex, const char *str, RzRegexMatch **out) { + RzPVector *matches = rz_regex_match_all_not_grouped(regex, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + if (!matches || rz_pvector_empty(matches)) { + return false; + } + *out = (RzRegexMatch *)rz_pvector_at(matches, 0); return true; } +bool test_rz_regex_all_match(void) { + RzRegex *reg = rz_regex_new("push", RZ_REGEX_EXTENDED, 0); + 
mu_assert_notnull(reg, "Regex was NULL"); + RzRegexMatch *match = NULL; + mu_assert_true(exec_regex(reg, "push", &match), "Regex match failed"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 0, "Start of match is not 0"); + mu_assert_eq(match->len, 4, "Len of match is not 4"); + rz_regex_free(reg); + mu_end; +} + +bool test_rz_regex_extend_space(void) { + RzRegex *reg = rz_regex_new("push esi", RZ_REGEX_DEFAULT, 0); + mu_assert_notnull(reg, "Regex was NULL"); + RzRegexMatch *match = NULL; + mu_assert_notnull(reg, "Regex was NULL"); + mu_assert_true(exec_regex(reg, "push esi", &match), "Regex match failed. Was ' ' replaced with \\s in the pattern?"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 0, "Start of match is not 0"); + mu_assert_eq(match->len, 8, "Len of match is not 8"); + rz_regex_free(reg); + mu_end; +} + +bool test_rz_regex_all_to_str(void) { + RzRegex *reg = rz_regex_new("123", RZ_REGEX_EXTENDED, 0); + mu_assert_notnull(reg, "Regex was NULL"); + RzStrBuf *res = rz_regex_full_match_str("(123)", "123 123 123", RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_DEFAULT, RZ_REGEX_DEFAULT, "\n"); + char *str = rz_strbuf_drain(res); + mu_assert_streq(str, "123\n123\n123", "String match failed."); + free(str); + + res = rz_regex_full_match_str("(123)", "123", RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_DEFAULT, RZ_REGEX_DEFAULT, "\n"); + str = rz_strbuf_drain(res); + mu_assert_streq(str, "123", "String match failed."); + free(str); + + res = rz_regex_full_match_str("(123)", "", RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_DEFAULT, RZ_REGEX_DEFAULT, "\n"); + str = rz_strbuf_drain(res); + mu_assert_streq(str, "", "String match failed."); + free(str); + rz_regex_free(reg); + mu_end; +} + bool test_rz_reg_exec(void) { const char *p = "abc|123"; - RzRegex *reg = rz_regex_new(p, "e"); + RzRegex *reg = rz_regex_new(p, RZ_REGEX_EXTENDED, 0); mu_assert_notnull(reg, "Regex was NULL"); - RzRegexMatch match; + RzRegexMatch *match = NULL; 
mu_assert_true(exec_regex(reg, "abc", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 0, "Start of match is not 0"); - mu_assert_eq(match.rm_eo, 3, "Start of match is not 3"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 0, "Start of match is not 0"); + mu_assert_eq(match->len, 3, "Len of match is not 3"); mu_assert_true(exec_regex(reg, "zabc", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 1, "Start of match is not 1"); - mu_assert_eq(match.rm_eo, 4, "Start of match is not 4"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 1, "Start of match is not 1"); + mu_assert_eq(match->len, 3, "Len of match is not 3"); mu_assert_true(exec_regex(reg, "abcz", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 0, "Start of match is not 0"); - mu_assert_eq(match.rm_eo, 3, "Start of match is not 3"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 0, "Start of match is not 0"); + mu_assert_eq(match->len, 3, "Len of match is not 3"); mu_assert_true(exec_regex(reg, "123", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 0, "Start of match is not 0"); - mu_assert_eq(match.rm_eo, 3, "Start of match is not 3"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 0, "Start of match is not 0"); + mu_assert_eq(match->len, 3, "Len of match is not 3"); mu_assert_true(exec_regex(reg, "z123", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 1, "Start of match is not 1"); - mu_assert_eq(match.rm_eo, 4, "Start of match is not 4"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 1, "Start of match is not 1"); + mu_assert_eq(match->len, 3, "Len of match is not 3"); mu_assert_true(exec_regex(reg, "123z", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 0, "Start of match is not 0"); - mu_assert_eq(match.rm_eo, 3, "Start of match is not 3"); + mu_assert_notnull(match, "match was not 
set"); + mu_assert_eq(match->start, 0, "Start of match is not 0"); + mu_assert_eq(match->len, 3, "Len of match is not 3"); rz_regex_free(reg); const char *p_big = "\\d+(([abc]*d[efg])+|[123]4[567]+)*|[zyx]+(test)+[mnb]"; - reg = rz_regex_new(p_big, "e"); + reg = rz_regex_new(p_big, RZ_REGEX_EXTENDED, 0); mu_assert_true(exec_regex(reg, "z1abcde123z", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 1, "Start of match is not 1"); - mu_assert_eq(match.rm_eo, 7, "Start of match is not 7"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 1, "Start of match is not 1"); + mu_assert_eq(match->len, 6, "Len of match is not 6"); mu_assert_true(exec_regex(reg, "ayztesttestb123z", &match), "Regex match failed"); - mu_assert_eq(match.rm_so, 1, "Start of match is not 1"); - mu_assert_eq(match.rm_eo, 12, "Start of match is not 11"); + mu_assert_notnull(match, "match was not set"); + mu_assert_eq(match->start, 1, "Start of match is not 1"); + mu_assert_eq(match->len, 11, "Len of match is not 11"); rz_regex_free(reg); mu_end; } @@ -53,38 +110,66 @@ bool test_rz_reg_exec(void) { bool test_rz_regex_capture(void) { char *str = "abcd PrefixHello42s xyz"; - RzRegex *re = rz_regex_new("[a-zA-Z]*(H[a-z]+)([0-9]*)s", "e"); + RzRegex *re = rz_regex_new("[a-zA-Z]*(H[a-z]+)([0-9]*)s", RZ_REGEX_EXTENDED, 0); mu_assert_notnull(re, "regex_new"); - RzRegexMatch groups[4]; - int r = rz_regex_exec(re, str, RZ_ARRAY_SIZE(groups), groups, 0); - mu_assert_eq(r, 0, "regex_exec"); + RzPVector *matches = rz_regex_match_all_not_grouped(re, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + mu_assert_true(matches && !rz_pvector_empty(matches), "Regex match failed"); + mu_assert_eq(rz_pvector_len(matches), 3, "Regex match count failed."); - mu_assert_eq(groups[0].rm_so, 5, "full match start"); - mu_assert_eq(groups[0].rm_eo, 19, "full match end"); - char *s = rz_regex_match_extract(str, &groups[0]); + RzRegexMatch *match = rz_pvector_at(matches, 0); + 
mu_assert_eq(match->start, 5, "full match start"); + mu_assert_eq(match->len, 14, "full match len"); + char *s = rz_str_ndup(str + match->start, match->len); mu_assert_streq_free(s, "PrefixHello42s", "full match extract"); - mu_assert_eq(groups[1].rm_so, 11, "capture 1 start"); - mu_assert_eq(groups[1].rm_eo, 16, "capture 1 end"); - s = rz_regex_match_extract(str, &groups[1]); + match = rz_pvector_at(matches, 1); + mu_assert_eq(match->start, 11, "capture 1 start"); + mu_assert_eq(match->len, 5, "capture 1 len"); + s = rz_str_ndup(str + match->start, match->len); mu_assert_streq_free(s, "Hello", "capture 1 extract"); - mu_assert_eq(groups[2].rm_so, 16, "capture 2 start"); - mu_assert_eq(groups[2].rm_eo, 18, "capture 2 end"); - s = rz_regex_match_extract(str, &groups[2]); + match = rz_pvector_at(matches, 2); + mu_assert_eq(match->start, 16, "capture 2 start"); + mu_assert_eq(match->len, 2, "capture 2 len"); + s = rz_str_ndup(str + match->start, match->len); mu_assert_streq_free(s, "42", "capture 2 extract"); - mu_assert_eq(groups[3].rm_so, -1, "capture 3 start"); - mu_assert_eq(groups[3].rm_eo, -1, "capture 3 end"); - s = rz_regex_match_extract(str, &groups[3]); - mu_assert_null(s, "capture 3 extract"); - rz_regex_free(re); mu_end; } +bool test_rz_regex_named_matches(void) { + RzRegex *reg = rz_regex_new("(?^\\w+)(:\\/\\/)(?\\w+)\\.(?\\w+)", RZ_REGEX_EXTENDED, 0); + mu_assert_notnull(reg, "Regex was NULL"); + mu_assert_streq((char *)rz_regex_get_match_name(reg, 1), "proto", "proto name not set."); + mu_assert_streq((char *)rz_regex_get_match_name(reg, 3), "domain", "domain name not set."); + mu_assert_streq((char *)rz_regex_get_match_name(reg, 4), "tdomain", "tdomain name not set."); + + RzPVector *matches = rz_regex_match_all_not_grouped(reg, "https://rizin.re", RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT); + mu_assert_true(matches && !rz_pvector_empty(matches), "Regex match failed"); + mu_assert_eq(rz_pvector_len(matches), 5, "Regex match count failed."); + + 
RzRegexMatch *match = rz_pvector_at(matches, 0); + mu_assert_streq((char *)rz_regex_get_match_name(reg, match->group_idx), "(null)", "(null) was not matched."); + match = rz_pvector_at(matches, 1); + mu_assert_streq((char *)rz_regex_get_match_name(reg, match->group_idx), "proto", "proto was not matched."); + match = rz_pvector_at(matches, 2); + mu_assert_streq((char *)rz_regex_get_match_name(reg, match->group_idx), "(null)", "(null) was not matched."); + match = rz_pvector_at(matches, 3); + mu_assert_streq((char *)rz_regex_get_match_name(reg, match->group_idx), "domain", "domain was not matched."); + match = rz_pvector_at(matches, 4); + mu_assert_streq((char *)rz_regex_get_match_name(reg, match->group_idx), "tdomain", "tdomain was not matched."); + + rz_regex_free(reg); + mu_end; +} + int main() { + mu_run_test(test_rz_regex_all_match); + mu_run_test(test_rz_regex_extend_space); mu_run_test(test_rz_reg_exec); mu_run_test(test_rz_regex_capture); + mu_run_test(test_rz_regex_all_to_str); + mu_run_test(test_rz_regex_named_matches); } diff --git a/test/unit/test_str.c b/test/unit/test_str.c index c412b5129d5..c33c382013c 100644 --- a/test/unit/test_str.c +++ b/test/unit/test_str.c @@ -212,7 +212,7 @@ bool test_rz_str_split_list(void) { rz_list_free(l); char s1[] = "Hello World\tAnd \t Everyone"; - RzList *l1 = rz_str_split_duplist_n_regex(s1, "[[:blank:]]+", 0, false); + RzList *l1 = rz_str_split_duplist_n_regex(s1, "\\s+", 0, false); mu_assert_eq(rz_list_length(l1), 4, "string has been split in 4 items"); mu_assert_streq(rz_list_get_n(l1, 0), "Hello", "first item"); mu_assert_streq(rz_list_get_n(l1, 1), "World", "second item"); @@ -228,7 +228,7 @@ bool test_rz_str_split_list(void) { rz_list_free(l2); char s3[] = "Hello World\tAnd \t Everyone\t"; - RzList *l3 = rz_str_split_list_regex(s3, "[[:blank:]]+", 0); + RzList *l3 = rz_str_split_list_regex(s3, "\\s+", 0); mu_assert_eq(rz_list_length(l3), 4, "string has been split in 4 items"); mu_assert_streq(rz_list_get_n(l3, 
0), "Hello", "first item"); mu_assert_streq(rz_list_get_n(l3, 1), "World", "second item");