Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace current regex engine with PCRE2 #4185

Merged
merged 3 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ peda-session-*
.cache/
test/.tmp/*
subprojects/capstone-*/
subprojects/pcre2/
subprojects/libzip-*/
subprojects/lz4-*/
subprojects/packagecache/
Expand Down
5 changes: 0 additions & 5 deletions .reuse/dep5
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,6 @@ Copyright: 1986-1995 Ian F. Darwin
1995-present Christos Zoulas and others
License: BSD-2-Clause

Files: librz/util/regex/*
Copyright: 1992, 1993, 1994 Henry Spencer
1992, 1993, 1994 The Regents of the University of California
License: BSD-3-Clause

Files: subprojects/rzheap/rz_jemalloc/*
Copyright: 2002-present Jason Evans <[email protected]>
2007-2012 Mozilla Foundation.
Expand Down
17 changes: 12 additions & 5 deletions binrz/rz-test/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: LGPL-3.0-only

#include "rz_test.h"
#include <rz_util/rz_str.h>
#include <rz_util/rz_regex.h>
#include <rz_cons.h>

#if __WINDOWS__
Expand Down Expand Up @@ -193,11 +195,16 @@ RZ_API RzSubprocessOutput *rz_test_run_cmd_test(RzTestRunConfig *config, RzCmdTe

RZ_API bool rz_test_cmp_cmd_output(const char *output, const char *expect, const char *regexp) {
if (regexp) {
RzList *matches = rz_regex_get_match_list(regexp, "e", output);
const char *match = rz_list_to_str(matches, '\0');
bool equal = (0 == strcmp(expect, match));
rz_list_free(matches);
RZ_FREE(match);
RzStrBuf *match_str = rz_regex_full_match_str(regexp, output, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
bool equal = false;
ut32 expect_len = strlen(expect);
if (expect_len > 0 && expect[expect_len - 1] == '\n') {
// Ignore newline
equal = (rz_str_cmp(expect, rz_strbuf_get(match_str), expect_len - 1) == 0);
} else {
equal = RZ_STR_EQ(expect, rz_strbuf_get(match_str));
}
rz_strbuf_free(match_str);
return equal;
}
return (0 == strcmp(expect, output));
Expand Down
5 changes: 2 additions & 3 deletions binrz/rz-test/rz-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -743,9 +743,8 @@ static void print_diff(const char *actual, const char *expected, const char *reg
const char *output = actual;

if (regexp) {
RzList *matches = rz_regex_get_match_list(regexp, "e", actual);
output = rz_list_to_str(matches, '\0');
rz_list_free(matches);
RzStrBuf *match_str = rz_regex_full_match_str(regexp, actual, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
output = rz_strbuf_drain(match_str);
}

d = rz_diff_lines_new(expected, output, NULL);
Expand Down
8 changes: 6 additions & 2 deletions librz/asm/arch/hexagon/hexagon_arch.c
Original file line number Diff line number Diff line change
Expand Up @@ -888,13 +888,17 @@ RZ_API void hexagon_reverse_opcode(const RzAsm *rz_asm, HexReversedOpcode *rz_re
memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
if (rz_reverse->asm_op->asm_toks) {
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
}
break;
case HEXAGON_DISAS:
memcpy(rz_reverse->asm_op, &hic->asm_op, sizeof(RzAsmOp));
rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
if (rz_reverse->asm_op->asm_toks) {
rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
}
break;
case HEXAGON_ANALYSIS:
memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
Expand Down
27 changes: 13 additions & 14 deletions librz/asm/asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include "rz_util/rz_print.h"
#include <rz_vector.h>
#include <rz_util/rz_strbuf.h>
#include <rz_regex.h>
#include <rz_util/rz_regex.h>
#include <rz_util/rz_assert.h>
#include <rz_list.h>
#include <stdio.h>
Expand Down Expand Up @@ -1545,7 +1545,7 @@ RZ_API void rz_asm_compile_token_patterns(RZ_INOUT RzPVector /*<RzAsmTokenPatter
rz_pvector_foreach (patterns, it) {
RzAsmTokenPattern *pat = *it;
if (!pat->regex) {
pat->regex = rz_regex_new(pat->pattern, "e");
pat->regex = rz_regex_new(pat->pattern, RZ_REGEX_EXTENDED, 0);
if (!pat->regex) {
RZ_LOG_WARN("Did not compile regex pattern %s.\n", pat->pattern);
rz_warn_if_reached();
Expand Down Expand Up @@ -1584,32 +1584,31 @@ RZ_API RZ_OWN RzAsmTokenString *rz_asm_tokenize_asm_regex(RZ_BORROW RzStrBuf *as
}
}

/// Start pattern search from the beginning
size_t asm_str_off = 0;

// Search for token pattern.
RzRegexMatch match[1];
while (rz_regex_exec(pattern->regex, asm_str + asm_str_off, 1, match, 0) == 0) {
st64 match_start = match[0].rm_so; // Token start
st64 match_end = match[0].rm_eo; // Token end
st64 len = match_end - match_start; // Length of token
st64 tok_offset = asm_str_off + match_start; // Token offset in str
RzPVector *match_sets = rz_regex_match_all(pattern->regex, asm_str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
void **grouped_match;
rz_pvector_foreach (match_sets, grouped_match) {
if (rz_pvector_empty(*grouped_match)) {
continue;
}
RzRegexMatch *match = rz_pvector_at(*grouped_match, 0);
st64 match_start = match->start; // Token start
st64 len = match->len; // Length of token
st64 tok_offset = match_start; // Token offset in str
if (overlaps_with_token(toks->tokens, tok_offset, tok_offset + len - 1)) {
// If this is true a token with higher priority was matched before.
asm_str_off = tok_offset + len;
continue;
}

// New token found, add it.
if (!is_num(asm_str + tok_offset)) {
add_token(toks, tok_offset, len, pattern->type, 0);
asm_str_off = tok_offset + len;
continue;
}
ut64 number = strtoull(asm_str + tok_offset, NULL, 0);
add_token(toks, tok_offset, len, pattern->type, number);
asm_str_off = tok_offset + len;
}
rz_pvector_free(match_sets);
}

rz_vector_sort(toks->tokens, (RzVectorComparator)cmp_tokens, false, NULL);
Expand Down
6 changes: 3 additions & 3 deletions librz/asm/p/asm_bf.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,22 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(RzAsm *a)
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_REGISTER;
pat->pattern = strdup(
"(ptr)");
"ptr");
rz_pvector_push(pvec, pat);

// reference pattern
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\[)|(\\])" // Matches a single bracket
"\\[|\\]" // Matches a single bracket
);
rz_pvector_push(pvec, pat);

// Separator pattern
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_SEPARATOR;
pat->pattern = strdup(
"([[:blank:]]+)");
"\\s+");
rz_pvector_push(pvec, pat);

return pvec;
Expand Down
26 changes: 13 additions & 13 deletions librz/asm/p/asm_hexagon.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
RzAsmTokenPattern *pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_META;
pat->pattern = strdup(
"(^[\\[\\?\\/\\|\\\\\\{])|(┌)|(│)|(└)|" // Packet prefix
"((∎)|[<\\}])([ :])(endloop[01]{1,2})" // Endloop markers
"^[\\[\\?\\/\\|\\\\\\{┌│└]|" // Packet prefix
"(|[<\\}])[\\s:]endloop[01]{1,2}" // Endloop markers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_META;
pat->pattern = strdup(
"(#{1,2})|(\\}$)|" // Immediate prefix, Closing packet bracket
"\\#{1,2}|\\}$|" // Immediate prefix, Closing packet bracket
"\\.new|:n?t|:raw|<err>" // .new and jump hints
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_REGISTER;
pat->pattern = strdup(
"([CNPRMQVO][[:digit:]]{1,2}(:[[:digit:]]{1,2})?(in)?)" // Registers and double registers
"[CNPRMQVO]\\d{1,2}(:\\d{1,2})?(in)?" // Registers and double registers
);
rz_pvector_push(pvec, pat);

Expand All @@ -60,51 +60,51 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_NUMBER;
pat->pattern = strdup(
"(0x[[:digit:]abcdef]+)" // Hexadecimal numbers
"0x(\\d|[abcdef])+" // Hexadecimal numbers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_MNEMONIC;
pat->pattern = strdup(
"([[:alpha:]]+[[:digit:]]+[[:alpha:]]*)" // Mnemonics with a decimal number in the name.
"[a-zA-Z]+\\d+[a-zA-Z]*" // Mnemonics with a decimal number in the name.
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_NUMBER;
pat->pattern = strdup(
"([[:digit:]]+)" // Decimal numbers
"\\d+" // Decimal numbers
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_SEPARATOR;
pat->pattern = strdup(
"([[:blank:]]+)|" // Spaces and tabs
"([,;\\.\\(\\)\\{\\}:])" // Brackets and others
"\\s+|" // Spaces and tabs
"[,;\\.\\(\\)\\{\\}:]" // Brackets and others
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\+)|(=)|(!)|(-)" // +, -, =, !
"[\\+=!-]" // +, -, =, !
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_OPERATOR;
pat->pattern = strdup(
"(\\])|(\\[|<{1,2}|>{1,2})" // ], [ (not the packet prefix), <, <<, >, >>
"\\]|\\[|<{1,2}|>{1,2}" // ], [ (not the packet prefix), <, <<, >, >>
);
rz_pvector_push(pvec, pat);

pat = RZ_NEW0(RzAsmTokenPattern);
pat->type = RZ_ASM_TOKEN_MNEMONIC;
pat->pattern = strdup(
"([[:alnum:]]+)|" // Alphanumeric mnemonics
"([[:alnum:]]+_[[:alnum:]]+)" // Mnemonics with "_", e.g. dealloc_return
"\\w+_\\w+|" // Mnemonics with "_", e.g. dealloc_return
"\\w+" // Alphanumeric mnemonics
);
rz_pvector_push(pvec, pat);

Expand Down
23 changes: 12 additions & 11 deletions librz/cons/less.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
// SPDX-License-Identifier: LGPL-3.0-only

#include <rz_cons.h>
#include <rz_regex.h>
#include <rz_util/rz_regex.h>
#include <rz_util.h>
Rot127 marked this conversation as resolved.
Show resolved Hide resolved
#include "pager_private.h"
#include "rz_vector.h"
Rot127 marked this conversation as resolved.
Show resolved Hide resolved

#define I(x) rz_cons_singleton()->x

Expand All @@ -31,7 +32,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
RzRegex *rx = NULL;
int w, h, ch, to, ui = 1, from = 0, i;
const char *sreg;
RzList **mla;
RzPVector **mla;

// rcons kills str after flushing the buffer, so we must keep a copy
char *ostr = strdup(str);
Expand All @@ -47,17 +48,14 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
if (lines_count < 1) {
mla = NULL;
} else {
mla = calloc(lines_count, sizeof(RzList *));
mla = calloc(lines_count, sizeof(RzPVector *));
if (!mla) {
free(p);
free(ostr);
free(lines);
return 0;
}
}
for (i = 0; i < lines_count; i++) {
mla[i] = rz_list_new();
}
rz_cons_set_raw(true);
rz_cons_show_cursor(false);
rz_cons_reset();
Expand All @@ -75,7 +73,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
ch = rz_cons_readchar();
if (exitkeys && strchr(exitkeys, ch)) {
for (i = 0; i < lines_count; i++) {
rz_list_free(mla[i]);
rz_pvector_free(mla[i]);
}
free(p);
free(mla);
Expand Down Expand Up @@ -129,7 +127,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
if (rx) {
rz_regex_free(rx);
}
rx = rz_regex_new(sreg, "");
rx = rz_regex_new(sreg, RZ_REGEX_EXTENDED | RZ_REGEX_MULTILINE, 0);
} else { /* we got an empty string */
from = pager_next_match(from, mla, lines_count);
break;
Expand All @@ -138,9 +136,12 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
break;
}
/* find all occurrences */
if (pager_all_matches(p, rx, mla, lines, lines_count)) {
from = pager_next_match(from, mla, lines_count);
RzPVector *matches = rz_regex_match_all_not_grouped(rx, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
if (rz_pvector_empty(matches)) {
rz_pvector_free(matches);
break;
}
from = pager_next_match(from, mla, lines_count);
break;
case 'n': /* next match */
/* search already performed */
Expand All @@ -157,7 +158,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
}
}
for (i = 0; i < lines_count; i++) {
rz_list_free(mla[i]);
rz_pvector_free(mla[i]);
}
free(mla);
rz_regex_free(rx);
Expand Down
Loading
Loading