Skip to content

Commit

Permalink
Replace current regex engine with PCRE2 (#4185)
Browse files Browse the repository at this point in the history
* Replace OpenBSD regex library with PCRE2.

PCRE2 performs considerably better than the OpenBSD
library (roughly 20 times faster).

The following flags are enabled for every pattern:

- PCRE2_UTF
- PCRE2_MATCH_INVALID_UTF
- PCRE2_NO_UTF_CHECK

All the others are optional.

Changes made:

- Adds PCRE2 as subproject.
- Changes the API away from POSIX to PCRE2.
- Edits many regex patterns because:
 - ' ' is skipped in patterns if the EXTENDED flag is set for matching; '\s' must be used instead.
 - '.' doesn't match newlines by default.
- Changes the API so matches and their groups are bundled into PVectors.
- Moves the regex component to rz_util.

* Fix cross build - add copy of PCRE2 dependency

Meson currently doesn't support a subproject being both native and non-native at the same time.
See: mesonbuild/meson#10947
Unfortunately, sdb depends on rz_util which in turn depends on PCRE2.
Excluding PCRE2 from the native build makes it impossible to link rz_util.
Adding it makes Meson complain that the dependencies cannot be mixed.

Hence, we compile a copy of PCRE2 for the native build if required.
  • Loading branch information
Rot127 committed Feb 5, 2024
1 parent df6b6a9 commit 5afc51f
Show file tree
Hide file tree
Showing 65 changed files with 1,312 additions and 5,669 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ peda-session-*
.cache/
test/.tmp/*
subprojects/capstone-*/
subprojects/pcre2/
subprojects/libzip-*/
subprojects/lz4-*/
subprojects/packagecache/
Expand Down
5 changes: 0 additions & 5 deletions .reuse/dep5
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,6 @@ Copyright: 1986-1995 Ian F. Darwin
1995-present Christos Zoulas and others
License: BSD-2-Clause

Files: librz/util/regex/*
Copyright: 1992, 1993, 1994 Henry Spencer
1992, 1993, 1994 The Regents of the University of California
License: BSD-3-Clause

Files: subprojects/rzheap/rz_jemalloc/*
Copyright: 2002-present Jason Evans <[email protected]>
2007-2012 Mozilla Foundation.
Expand Down
17 changes: 12 additions & 5 deletions binrz/rz-test/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: LGPL-3.0-only

#include "rz_test.h"
#include <rz_util/rz_str.h>
#include <rz_util/rz_regex.h>
#include <rz_cons.h>

#if __WINDOWS__
Expand Down Expand Up @@ -193,11 +195,16 @@ RZ_API RzSubprocessOutput *rz_test_run_cmd_test(RzTestRunConfig *config, RzCmdTe

RZ_API bool rz_test_cmp_cmd_output(const char *output, const char *expect, const char *regexp) {
if (regexp) {
RzList *matches = rz_regex_get_match_list(regexp, "e", output);
const char *match = rz_list_to_str(matches, '\0');
bool equal = (0 == strcmp(expect, match));
rz_list_free(matches);
RZ_FREE(match);
RzStrBuf *match_str = rz_regex_full_match_str(regexp, output, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
bool equal = false;
ut32 expect_len = strlen(expect);
if (expect_len > 0 && expect[expect_len - 1] == '\n') {
// Ignore newline
equal = (rz_str_cmp(expect, rz_strbuf_get(match_str), expect_len - 1) == 0);
} else {
equal = RZ_STR_EQ(expect, rz_strbuf_get(match_str));
}
rz_strbuf_free(match_str);
return equal;
}
return (0 == strcmp(expect, output));
Expand Down
5 changes: 2 additions & 3 deletions binrz/rz-test/rz-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -743,9 +743,8 @@ static void print_diff(const char *actual, const char *expected, const char *reg
const char *output = actual;

if (regexp) {
RzList *matches = rz_regex_get_match_list(regexp, "e", actual);
output = rz_list_to_str(matches, '\0');
rz_list_free(matches);
RzStrBuf *match_str = rz_regex_full_match_str(regexp, actual, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
output = rz_strbuf_drain(match_str);
}

d = rz_diff_lines_new(expected, output, NULL);
Expand Down
8 changes: 6 additions & 2 deletions librz/asm/arch/hexagon/hexagon_arch.c
Original file line number Diff line number Diff line change
Expand Up @@ -888,13 +888,17 @@ RZ_API void hexagon_reverse_opcode(const RzAsm *rz_asm, HexReversedOpcode *rz_re
memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
if (rz_reverse->asm_op->asm_toks) {
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
}
break;
case HEXAGON_DISAS:
memcpy(rz_reverse->asm_op, &hic->asm_op, sizeof(RzAsmOp));
rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
if (rz_reverse->asm_op->asm_toks) {
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
}
break;
case HEXAGON_ANALYSIS:
memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
Expand Down
27 changes: 13 additions & 14 deletions librz/asm/asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include "rz_util/rz_print.h"
#include <rz_vector.h>
#include <rz_util/rz_strbuf.h>
#include <rz_regex.h>
#include <rz_util/rz_regex.h>
#include <rz_util/rz_assert.h>
#include <rz_list.h>
#include <stdio.h>
Expand Down Expand Up @@ -1545,7 +1545,7 @@ RZ_API void rz_asm_compile_token_patterns(RZ_INOUT RzPVector /*<RzAsmTokenPatter
rz_pvector_foreach (patterns, it) {
RzAsmTokenPattern *pat = *it;
if (!pat->regex) {
pat->regex = rz_regex_new(pat->pattern, "e");
pat->regex = rz_regex_new(pat->pattern, RZ_REGEX_EXTENDED, 0);
if (!pat->regex) {
RZ_LOG_WARN("Did not compile regex pattern %s.\n", pat->pattern);
rz_warn_if_reached();
Expand Down Expand Up @@ -1584,32 +1584,31 @@ RZ_API RZ_OWN RzAsmTokenString *rz_asm_tokenize_asm_regex(RZ_BORROW RzStrBuf *as
}
}

/// Start pattern search from the beginning
size_t asm_str_off = 0;

// Search for token pattern.
RzRegexMatch match[1];
while (rz_regex_exec(pattern->regex, asm_str + asm_str_off, 1, match, 0) == 0) {
st64 match_start = match[0].rm_so; // Token start
st64 match_end = match[0].rm_eo; // Token end
st64 len = match_end - match_start; // Length of token
st64 tok_offset = asm_str_off + match_start; // Token offset in str
RzPVector *match_sets = rz_regex_match_all(pattern->regex, asm_str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
void **grouped_match;
rz_pvector_foreach (match_sets, grouped_match) {
if (rz_pvector_empty(*grouped_match)) {
continue;
}
RzRegexMatch *match = rz_pvector_at(*grouped_match, 0);
st64 match_start = match->start; // Token start
st64 len = match->len; // Length of token
st64 tok_offset = match_start; // Token offset in str
if (overlaps_with_token(toks->tokens, tok_offset, tok_offset + len - 1)) {
// If this is true a token with higher priority was matched before.
asm_str_off = tok_offset + len;
continue;
}

// New token found, add it.
if (!is_num(asm_str + tok_offset)) {
add_token(toks, tok_offset, len, pattern->type, 0);
asm_str_off = tok_offset + len;
continue;
}
ut64 number = strtoull(asm_str + tok_offset, NULL, 0);
add_token(toks, tok_offset, len, pattern->type, number);
asm_str_off = tok_offset + len;
}
rz_pvector_free(match_sets);
}

rz_vector_sort(toks->tokens, (RzVectorComparator)cmp_tokens, false, NULL);
Expand Down
6 changes: 3 additions & 3 deletions librz/asm/p/asm_bf.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,22 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(RzAsm *a)
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_REGISTER;
pat->pattern = strdup(
"(ptr)");
"ptr");
rz_pvector_push(pvec, pat);

// reference pattern
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\[)|(\\])" // Matches a single bracket
"\\[|\\]" // Matches a single bracket
);
rz_pvector_push(pvec, pat);

// Separator pattern
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_SEPARATOR;
pat->pattern = strdup(
"([[:blank:]]+)");
"\\s+");
rz_pvector_push(pvec, pat);

return pvec;
Expand Down
26 changes: 13 additions & 13 deletions librz/asm/p/asm_hexagon.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
RzAsmTokenPattern *pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_META;
pat->pattern = strdup(
"(^[\\[\\?\\/\\|\\\\\\{])|(┌)|(│)|(└)|" // Packet prefix
"((∎)|[<\\}])([ :])(endloop[01]{1,2})" // Endloop markers
"^[\\[\\?\\/\\|\\\\\\{┌│└]|" // Packet prefix
"(|[<\\}])[\\s:]endloop[01]{1,2}" // Endloop markers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_META;
pat->pattern = strdup(
"(#{1,2})|(\\}$)|" // Immediate prefix, Closing packet bracket
"\\#{1,2}|\\}$|" // Immediate prefix, Closing packet bracket
"\\.new|:n?t|:raw|<err>" // .new and jump hints
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_REGISTER;
pat->pattern = strdup(
"([CNPRMQVO][[:digit:]]{1,2}(:[[:digit:]]{1,2})?(in)?)" // Registers and double registers
"[CNPRMQVO]\\d{1,2}(:\\d{1,2})?(in)?" // Registers and double registers
);
rz_pvector_push(pvec, pat);

Expand All @@ -60,51 +60,51 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_NUMBER;
pat->pattern = strdup(
"(0x[[:digit:]abcdef]+)" // Hexadecimal numbers
"0x(\\d|[abcdef])+" // Hexadecimal numbers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_MNEMONIC;
pat->pattern = strdup(
"([[:alpha:]]+[[:digit:]]+[[:alpha:]]*)" // Mnemonics with a decimal number in the name.
"[a-zA-Z]+\\d+[a-zA-Z]*" // Mnemonics with a decimal number in the name.
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_NUMBER;
pat->pattern = strdup(
"([[:digit:]]+)" // Decimal numbers
"\\d+" // Decimal numbers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_SEPARATOR;
pat->pattern = strdup(
"([[:blank:]]+)|" // Spaces and tabs
"([,;\\.\\(\\)\\{\\}:])" // Brackets and others
"\\s+|" // Spaces and tabs
"[,;\\.\\(\\)\\{\\}:]" // Brackets and others
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\+)|(=)|(!)|(-)" // +,-,=,],[, ! (not the packet prefix)
"[\\+=!-]" // +,-,=,],[, ! (not the packet prefix)
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\])|(\\[|<{1,2}|>{1,2})" // +,-,=,],[, ! (not the packet prefix)
"\\]|\\[|<{1,2}|>{1,2}" // +,-,=,],[, ! (not the packet prefix)
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_MNEMONIC;
pat->pattern = strdup(
"([[:alnum:]]+)|" // Alphanumeric mnemonics
"([[:alnum:]]+_[[:alnum:]]+)" // Menmonics with "_" e.g dealloc_return
"\\w+_\\w+|" // Menmonics with "_" e.g dealloc_return
"\\w+" // Alphanumeric mnemonics
);
rz_pvector_push(pvec, pat);

Expand Down
23 changes: 12 additions & 11 deletions librz/cons/less.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
// SPDX-License-Identifier: LGPL-3.0-only

#include <rz_cons.h>
#include <rz_regex.h>
#include <rz_util/rz_regex.h>
#include <rz_util.h>
#include "pager_private.h"
#include "rz_vector.h"

#define I(x) rz_cons_singleton()->x

Expand All @@ -31,7 +32,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
RzRegex *rx = NULL;
int w, h, ch, to, ui = 1, from = 0, i;
const char *sreg;
RzList **mla;
RzPVector **mla;

// rcons kills str after flushing the buffer, so we must keep a copy
char *ostr = strdup(str);
Expand All @@ -47,17 +48,14 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
if (lines_count < 1) {
mla = NULL;
} else {
mla = calloc(lines_count, sizeof(RzList *));
mla = calloc(lines_count, sizeof(RzPVector *));
if (!mla) {
free(p);
free(ostr);
free(lines);
return 0;
}
}
for (i = 0; i < lines_count; i++) {
mla[i] = rz_list_new();
}
rz_cons_set_raw(true);
rz_cons_show_cursor(false);
rz_cons_reset();
Expand All @@ -75,7 +73,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
ch = rz_cons_readchar();
if (exitkeys && strchr(exitkeys, ch)) {
for (i = 0; i < lines_count; i++) {
rz_list_free(mla[i]);
rz_pvector_free(mla[i]);
}
free(p);
free(mla);
Expand Down Expand Up @@ -129,7 +127,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
if (rx) {
rz_regex_free(rx);
}
rx = rz_regex_new(sreg, "");
rx = rz_regex_new(sreg, RZ_REGEX_EXTENDED | RZ_REGEX_MULTILINE, 0);
} else { /* we got an empty string */
from = pager_next_match(from, mla, lines_count);
break;
Expand All @@ -138,9 +136,12 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
break;
}
/* find all occurrences */
if (pager_all_matches(p, rx, mla, lines, lines_count)) {
from = pager_next_match(from, mla, lines_count);
RzPVector *matches = rz_regex_match_all_not_grouped(rx, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
if (rz_pvector_empty(matches)) {
rz_pvector_free(matches);
break;
}
from = pager_next_match(from, mla, lines_count);
break;
case 'n': /* next match */
/* search already performed */
Expand All @@ -157,7 +158,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
}
}
for (i = 0; i < lines_count; i++) {
rz_list_free(mla[i]);
rz_pvector_free(mla[i]);
}
free(mla);
rz_regex_free(rx);
Expand Down
Loading

0 comments on commit 5afc51f

Please sign in to comment.