Skip to content

Commit

Permalink
Replace OpenBSD regex library with PCRE2.
Browse files Browse the repository at this point in the history
PCRE2 performs considerably better than the OpenBSD
library (roughly 20 times faster).

The following flags are enabled for every pattern:

- PCRE2_UTF
- PCRE2_MATCH_INVALID_UTF
- PCRE2_NO_UTF_CHECK

All the others are optional.

Changes made:

- Adds PCRE2 as subproject.
- Changes the API away from POSIX to PCRE2.
- Edits many regex patterns because:
 - ' ' is skipped in patterns if the EXTENDED flag is set for matching, so '\s' must be used instead.
 - '.' doesn't match newlines by default.
- Changes the API so matches and their groups are bundled into PVectors.
- Moves the regex component to rz_util.
  • Loading branch information
Rot127 committed Feb 3, 2024
1 parent 6eeb9e4 commit d11a763
Show file tree
Hide file tree
Showing 63 changed files with 1,196 additions and 5,669 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ peda-session-*
.cache/
test/.tmp/*
subprojects/capstone-*/
subprojects/pcre2/
subprojects/libzip-*/
subprojects/lz4-*/
subprojects/packagecache/
Expand Down
5 changes: 0 additions & 5 deletions .reuse/dep5
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,6 @@ Copyright: 1986-1995 Ian F. Darwin
1995-present Christos Zoulas and others
License: BSD-2-Clause

Files: librz/util/regex/*
Copyright: 1992, 1993, 1994 Henry Spencer
1992, 1993, 1994 The Regents of the University of California
License: BSD-3-Clause

Files: subprojects/rzheap/rz_jemalloc/*
Copyright: 2002-present Jason Evans <[email protected]>
2007-2012 Mozilla Foundation.
Expand Down
17 changes: 12 additions & 5 deletions binrz/rz-test/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: LGPL-3.0-only

#include "rz_test.h"
#include <rz_util/rz_str.h>
#include <rz_util/rz_regex.h>
#include <rz_cons.h>

#if __WINDOWS__
Expand Down Expand Up @@ -193,11 +195,16 @@ RZ_API RzSubprocessOutput *rz_test_run_cmd_test(RzTestRunConfig *config, RzCmdTe

RZ_API bool rz_test_cmp_cmd_output(const char *output, const char *expect, const char *regexp) {
if (regexp) {
RzList *matches = rz_regex_get_match_list(regexp, "e", output);
const char *match = rz_list_to_str(matches, '\0');
bool equal = (0 == strcmp(expect, match));
rz_list_free(matches);
RZ_FREE(match);
RzStrBuf *match_str = rz_regex_full_match_str(regexp, output, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
bool equal = false;
ut32 expect_len = strlen(expect);
if (expect_len > 0 && expect[expect_len - 1] == '\n') {
// Ignore newline
equal = (rz_str_cmp(expect, rz_strbuf_get(match_str), expect_len - 1) == 0);
} else {
equal = RZ_STR_EQ(expect, rz_strbuf_get(match_str));
}
rz_strbuf_free(match_str);
return equal;
}
return (0 == strcmp(expect, output));
Expand Down
5 changes: 2 additions & 3 deletions binrz/rz-test/rz-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -743,9 +743,8 @@ static void print_diff(const char *actual, const char *expected, const char *reg
const char *output = actual;

if (regexp) {
RzList *matches = rz_regex_get_match_list(regexp, "e", actual);
output = rz_list_to_str(matches, '\0');
rz_list_free(matches);
RzStrBuf *match_str = rz_regex_full_match_str(regexp, actual, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
output = rz_strbuf_drain(match_str);
}

d = rz_diff_lines_new(expected, output, NULL);
Expand Down
8 changes: 6 additions & 2 deletions librz/asm/arch/hexagon/hexagon_arch.c
Original file line number Diff line number Diff line change
Expand Up @@ -888,13 +888,17 @@ RZ_API void hexagon_reverse_opcode(const RzAsm *rz_asm, HexReversedOpcode *rz_re
memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
if (rz_reverse->asm_op->asm_toks) {
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
}
break;
case HEXAGON_DISAS:
memcpy(rz_reverse->asm_op, &hic->asm_op, sizeof(RzAsmOp));
rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
if (rz_reverse->asm_op->asm_toks) {
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
}
break;
case HEXAGON_ANALYSIS:
memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
Expand Down
27 changes: 13 additions & 14 deletions librz/asm/asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include "rz_util/rz_print.h"
#include <rz_vector.h>
#include <rz_util/rz_strbuf.h>
#include <rz_regex.h>
#include <rz_util/rz_regex.h>
#include <rz_util/rz_assert.h>
#include <rz_list.h>
#include <stdio.h>
Expand Down Expand Up @@ -1545,7 +1545,7 @@ RZ_API void rz_asm_compile_token_patterns(RZ_INOUT RzPVector /*<RzAsmTokenPatter
rz_pvector_foreach (patterns, it) {
RzAsmTokenPattern *pat = *it;
if (!pat->regex) {
pat->regex = rz_regex_new(pat->pattern, "e");
pat->regex = rz_regex_new(pat->pattern, RZ_REGEX_EXTENDED, 0);
if (!pat->regex) {
RZ_LOG_WARN("Did not compile regex pattern %s.\n", pat->pattern);
rz_warn_if_reached();
Expand Down Expand Up @@ -1584,32 +1584,31 @@ RZ_API RZ_OWN RzAsmTokenString *rz_asm_tokenize_asm_regex(RZ_BORROW RzStrBuf *as
}
}

/// Start pattern search from the beginning
size_t asm_str_off = 0;

// Search for token pattern.
RzRegexMatch match[1];
while (rz_regex_exec(pattern->regex, asm_str + asm_str_off, 1, match, 0) == 0) {
st64 match_start = match[0].rm_so; // Token start
st64 match_end = match[0].rm_eo; // Token end
st64 len = match_end - match_start; // Length of token
st64 tok_offset = asm_str_off + match_start; // Token offset in str
RzPVector *match_sets = rz_regex_match_all(pattern->regex, asm_str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
void **grouped_match;
rz_pvector_foreach (match_sets, grouped_match) {
if (rz_pvector_empty(*grouped_match)) {
continue;
}
RzRegexMatch *match = rz_pvector_at(*grouped_match, 0);
st64 match_start = match->start; // Token start
st64 len = match->len; // Length of token
st64 tok_offset = match_start; // Token offset in str
if (overlaps_with_token(toks->tokens, tok_offset, tok_offset + len - 1)) {
// If this is true a token with higher priority was matched before.
asm_str_off = tok_offset + len;
continue;
}

// New token found, add it.
if (!is_num(asm_str + tok_offset)) {
add_token(toks, tok_offset, len, pattern->type, 0);
asm_str_off = tok_offset + len;
continue;
}
ut64 number = strtoull(asm_str + tok_offset, NULL, 0);
add_token(toks, tok_offset, len, pattern->type, number);
asm_str_off = tok_offset + len;
}
rz_pvector_free(match_sets);
}

rz_vector_sort(toks->tokens, (RzVectorComparator)cmp_tokens, false, NULL);
Expand Down
6 changes: 3 additions & 3 deletions librz/asm/p/asm_bf.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,22 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(RzAsm *a)
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_REGISTER;
pat->pattern = strdup(
"(ptr)");
"ptr");
rz_pvector_push(pvec, pat);

// reference pattern
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\[)|(\\])" // Matches a single bracket
"\\[|\\]" // Matches a single bracket
);
rz_pvector_push(pvec, pat);

// Separator pattern
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_SEPARATOR;
pat->pattern = strdup(
"([[:blank:]]+)");
"\\s+");
rz_pvector_push(pvec, pat);

return pvec;
Expand Down
26 changes: 13 additions & 13 deletions librz/asm/p/asm_hexagon.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
RzAsmTokenPattern *pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_META;
pat->pattern = strdup(
"(^[\\[\\?\\/\\|\\\\\\{])|(┌)|(│)|(└)|" // Packet prefix
"((∎)|[<\\}])([ :])(endloop[01]{1,2})" // Endloop markers
"^[\\[\\?\\/\\|\\\\\\{┌│└]|" // Packet prefix
"(|[<\\}])[\\s:]endloop[01]{1,2}" // Endloop markers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_META;
pat->pattern = strdup(
"(#{1,2})|(\\}$)|" // Immediate prefix, Closing packet bracket
"\\#{1,2}|\\}$|" // Immediate prefix, Closing packet bracket
"\\.new|:n?t|:raw|<err>" // .new and jump hints
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_REGISTER;
pat->pattern = strdup(
"([CNPRMQVO][[:digit:]]{1,2}(:[[:digit:]]{1,2})?(in)?)" // Registers and double registers
"[CNPRMQVO]\\d{1,2}(:\\d{1,2})?(in)?" // Registers and double registers
);
rz_pvector_push(pvec, pat);

Expand All @@ -60,51 +60,51 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_NUMBER;
pat->pattern = strdup(
"(0x[[:digit:]abcdef]+)" // Hexadecimal numbers
"0x(\\d|[abcdef])+" // Hexadecimal numbers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_MNEMONIC;
pat->pattern = strdup(
"([[:alpha:]]+[[:digit:]]+[[:alpha:]]*)" // Mnemonics with a decimal number in the name.
"[a-zA-Z]+\\d+[a-zA-Z]*" // Mnemonics with a decimal number in the name.
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_NUMBER;
pat->pattern = strdup(
"([[:digit:]]+)" // Decimal numbers
"\\d+" // Decimal numbers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_SEPARATOR;
pat->pattern = strdup(
"([[:blank:]]+)|" // Spaces and tabs
"([,;\\.\\(\\)\\{\\}:])" // Brackets and others
"\\s+|" // Spaces and tabs
"[,;\\.\\(\\)\\{\\}:]" // Brackets and others
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\+)|(=)|(!)|(-)" // +,-,=,],[, ! (not the packet prefix)
"[\\+=!-]" // +,-,=,],[, ! (not the packet prefix)
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\])|(\\[|<{1,2}|>{1,2})" // +,-,=,],[, ! (not the packet prefix)
"\\]|\\[|<{1,2}|>{1,2}" // +,-,=,],[, ! (not the packet prefix)
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_MNEMONIC;
pat->pattern = strdup(
"([[:alnum:]]+)|" // Alphanumeric mnemonics
"([[:alnum:]]+_[[:alnum:]]+)" // Menmonics with "_" e.g dealloc_return
"\\w+_\\w+|" // Menmonics with "_" e.g dealloc_return
"\\w+" // Alphanumeric mnemonics
);
rz_pvector_push(pvec, pat);

Expand Down
23 changes: 12 additions & 11 deletions librz/cons/less.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
// SPDX-License-Identifier: LGPL-3.0-only

#include <rz_cons.h>
#include <rz_regex.h>
#include <rz_util/rz_regex.h>
#include <rz_util.h>
#include "pager_private.h"
#include "rz_vector.h"

#define I(x) rz_cons_singleton()->x

Expand All @@ -31,7 +32,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
RzRegex *rx = NULL;
int w, h, ch, to, ui = 1, from = 0, i;
const char *sreg;
RzList **mla;
RzPVector **mla;

// rcons kills str after flushing the buffer, so we must keep a copy
char *ostr = strdup(str);
Expand All @@ -47,17 +48,14 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
if (lines_count < 1) {
mla = NULL;
} else {
mla = calloc(lines_count, sizeof(RzList *));
mla = calloc(lines_count, sizeof(RzPVector *));
if (!mla) {
free(p);
free(ostr);
free(lines);
return 0;
}
}
for (i = 0; i < lines_count; i++) {
mla[i] = rz_list_new();
}
rz_cons_set_raw(true);
rz_cons_show_cursor(false);
rz_cons_reset();
Expand All @@ -75,7 +73,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
ch = rz_cons_readchar();
if (exitkeys && strchr(exitkeys, ch)) {
for (i = 0; i < lines_count; i++) {
rz_list_free(mla[i]);
rz_pvector_free(mla[i]);
}
free(p);
free(mla);
Expand Down Expand Up @@ -129,7 +127,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
if (rx) {
rz_regex_free(rx);
}
rx = rz_regex_new(sreg, "");
rx = rz_regex_new(sreg, RZ_REGEX_EXTENDED | RZ_REGEX_MULTILINE, 0);
} else { /* we got an empty string */
from = pager_next_match(from, mla, lines_count);
break;
Expand All @@ -138,9 +136,12 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
break;
}
/* find all occurrences */
if (pager_all_matches(p, rx, mla, lines, lines_count)) {
from = pager_next_match(from, mla, lines_count);
RzPVector *matches = rz_regex_match_all_not_grouped(rx, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
if (rz_pvector_empty(matches)) {
rz_pvector_free(matches);
break;
}
from = pager_next_match(from, mla, lines_count);
break;
case 'n': /* next match */
/* search already performed */
Expand All @@ -157,7 +158,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
}
}
for (i = 0; i < lines_count; i++) {
rz_list_free(mla[i]);
rz_pvector_free(mla[i]);
}
free(mla);
rz_regex_free(rx);
Expand Down
Loading

0 comments on commit d11a763

Please sign in to comment.