Replace current regex engine with PCRE2 (#4185)

* Replace OpenBSD regex library with PCRE2.

  PCRE2 has far better performance than the OpenBSD library (roughly 20 times faster).

  The following flags are enabled for every pattern:

  - PCRE2_UTF
  - PCRE2_MATCH_INVALID_UTF
  - PCRE2_NO_UTF_CHECK

  All the others are optional.

  Changes made:

  - Adds PCRE2 as subproject.
  - Changes the API away from POSIX to PCRE2.
  - Edits many regex patterns because:
    - ' ' is skipped in patterns if the EXTENDED flag is set for matching. '\s' must be used now.
    - '.' doesn't match newlines by default.
  - Changes the API so matches and their groups are bundled into PVectors.
  - Moves the regex component to rz_util.

* Fix cross build - add copy of PCRE2 dependency

  Meson currently doesn't support subprojects being native and non-native at the same time. See: mesonbuild/meson#10947

  Unfortunately, sdb depends on rz_util, which in turn depends on PCRE2. Excluding PCRE2 from the native build makes linking rz_util impossible. Adding it will make Meson complain that the dependencies cannot be mixed. Hence, we compile a copy of PCRE2 for the native build if required.
rizinorg · Feb 5, 2024 · 5afc51f · 5afc51f
1 parent df6b6a9
commit 5afc51f
Show file tree

Hide file tree

Showing 65 changed files with 1,312 additions and 5,669 deletions.
diff --git a/.gitignore b/.gitignore
@@ -117,6 +117,7 @@ peda-session-*
 .cache/
 test/.tmp/*
 subprojects/capstone-*/
+subprojects/pcre2/
 subprojects/libzip-*/
 subprojects/lz4-*/
 subprojects/packagecache/

diff --git a/.reuse/dep5 b/.reuse/dep5
@@ -188,11 +188,6 @@ Copyright: 1986-1995 Ian F. Darwin
            1995-present Christos Zoulas and others
 License: BSD-2-Clause
 
-Files: librz/util/regex/*
-Copyright: 1992, 1993, 1994 Henry Spencer
-           1992, 1993, 1994 The Regents of the University of California
-License: BSD-3-Clause
-
 Files: subprojects/rzheap/rz_jemalloc/*
 Copyright: 2002-present Jason Evans <[email protected]>
            2007-2012 Mozilla Foundation.

diff --git a/binrz/rz-test/run.c b/binrz/rz-test/run.c
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: LGPL-3.0-only
 
 #include "rz_test.h"
+#include <rz_util/rz_str.h>
+#include <rz_util/rz_regex.h>
 #include <rz_cons.h>
 
 #if __WINDOWS__
@@ -193,11 +195,16 @@ RZ_API RzSubprocessOutput *rz_test_run_cmd_test(RzTestRunConfig *config, RzCmdTe
 
 RZ_API bool rz_test_cmp_cmd_output(const char *output, const char *expect, const char *regexp) {
 	if (regexp) {
-		RzList *matches = rz_regex_get_match_list(regexp, "e", output);
-		const char *match = rz_list_to_str(matches, '\0');
-		bool equal = (0 == strcmp(expect, match));
-		rz_list_free(matches);
-		RZ_FREE(match);
+		RzStrBuf *match_str = rz_regex_full_match_str(regexp, output, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
+		bool equal = false;
+		ut32 expect_len = strlen(expect);
+		if (expect_len > 0 && expect[expect_len - 1] == '\n') {
+			// Ignore newline
+			equal = (rz_str_cmp(expect, rz_strbuf_get(match_str), expect_len - 1) == 0);
+		} else {
+			equal = RZ_STR_EQ(expect, rz_strbuf_get(match_str));
+		}
+		rz_strbuf_free(match_str);
 		return equal;
 	}
 	return (0 == strcmp(expect, output));

diff --git a/binrz/rz-test/rz-test.c b/binrz/rz-test/rz-test.c
@@ -743,9 +743,8 @@ static void print_diff(const char *actual, const char *expected, const char *reg
 	const char *output = actual;
 
 	if (regexp) {
-		RzList *matches = rz_regex_get_match_list(regexp, "e", actual);
-		output = rz_list_to_str(matches, '\0');
-		rz_list_free(matches);
+		RzStrBuf *match_str = rz_regex_full_match_str(regexp, actual, RZ_REGEX_ZERO_TERMINATED, RZ_REGEX_EXTENDED, RZ_REGEX_DEFAULT, "\n");
+		output = rz_strbuf_drain(match_str);
 	}
 
 	d = rz_diff_lines_new(expected, output, NULL);

diff --git a/librz/asm/arch/hexagon/hexagon_arch.c b/librz/asm/arch/hexagon/hexagon_arch.c
@@ -888,13 +888,17 @@ RZ_API void hexagon_reverse_opcode(const RzAsm *rz_asm, HexReversedOpcode *rz_re
 		memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));
 		rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
 		rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
-		rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
+		if (rz_reverse->asm_op->asm_toks) {
+			rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
+		}
 		break;
 	case HEXAGON_DISAS:
 		memcpy(rz_reverse->asm_op, &hic->asm_op, sizeof(RzAsmOp));
 		rz_strbuf_set(&rz_reverse->asm_op->buf_asm, hic->text);
 		rz_reverse->asm_op->asm_toks = rz_asm_tokenize_asm_regex(&rz_reverse->asm_op->buf_asm, state->token_patterns);
-		rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
+		if (rz_reverse->asm_op->asm_toks) {
+			rz_reverse->asm_op->asm_toks->op_type = hic->ana_op.type;
+		}
 		break;
 	case HEXAGON_ANALYSIS:
 		memcpy(rz_reverse->ana_op, &hic->ana_op, sizeof(RzAnalysisOp));

diff --git a/librz/asm/asm.c b/librz/asm/asm.c
@@ -5,7 +5,7 @@
 #include "rz_util/rz_print.h"
 #include <rz_vector.h>
 #include <rz_util/rz_strbuf.h>
-#include <rz_regex.h>
+#include <rz_util/rz_regex.h>
 #include <rz_util/rz_assert.h>
 #include <rz_list.h>
 #include <stdio.h>
@@ -1545,7 +1545,7 @@ RZ_API void rz_asm_compile_token_patterns(RZ_INOUT RzPVector /*<RzAsmTokenPatter
 	rz_pvector_foreach (patterns, it) {
 		RzAsmTokenPattern *pat = *it;
 		if (!pat->regex) {
-			pat->regex = rz_regex_new(pat->pattern, "e");
+			pat->regex = rz_regex_new(pat->pattern, RZ_REGEX_EXTENDED, 0);
 			if (!pat->regex) {
 				RZ_LOG_WARN("Did not compile regex pattern %s.\n", pat->pattern);
 				rz_warn_if_reached();
@@ -1584,32 +1584,31 @@ RZ_API RZ_OWN RzAsmTokenString *rz_asm_tokenize_asm_regex(RZ_BORROW RzStrBuf *as
 			}
 		}
 
-		/// Start pattern search from the beginning
-		size_t asm_str_off = 0;
-
 		// Search for token pattern.
-		RzRegexMatch match[1];
-		while (rz_regex_exec(pattern->regex, asm_str + asm_str_off, 1, match, 0) == 0) {
-			st64 match_start = match[0].rm_so; // Token start
-			st64 match_end = match[0].rm_eo; // Token end
-			st64 len = match_end - match_start; // Length of token
-			st64 tok_offset = asm_str_off + match_start; // Token offset in str
+		RzPVector *match_sets = rz_regex_match_all(pattern->regex, asm_str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
+		void **grouped_match;
+		rz_pvector_foreach (match_sets, grouped_match) {
+			if (rz_pvector_empty(*grouped_match)) {
+				continue;
+			}
+			RzRegexMatch *match = rz_pvector_at(*grouped_match, 0);
+			st64 match_start = match->start; // Token start
+			st64 len = match->len; // Length of token
+			st64 tok_offset = match_start; // Token offset in str
 			if (overlaps_with_token(toks->tokens, tok_offset, tok_offset + len - 1)) {
 				// If this is true a token with higher priority was matched before.
-				asm_str_off = tok_offset + len;
 				continue;
 			}
 
 			// New token found, add it.
 			if (!is_num(asm_str + tok_offset)) {
 				add_token(toks, tok_offset, len, pattern->type, 0);
-				asm_str_off = tok_offset + len;
 				continue;
 			}
 			ut64 number = strtoull(asm_str + tok_offset, NULL, 0);
 			add_token(toks, tok_offset, len, pattern->type, number);
-			asm_str_off = tok_offset + len;
 		}
+		rz_pvector_free(match_sets);
 	}
 
 	rz_vector_sort(toks->tokens, (RzVectorComparator)cmp_tokens, false, NULL);

diff --git a/librz/asm/p/asm_bf.c b/librz/asm/p/asm_bf.c
@@ -30,22 +30,22 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(RzAsm *a)
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_REGISTER;
 	pat->pattern = strdup(
-		"(ptr)");
+		"ptr");
 	rz_pvector_push(pvec, pat);
 
 	// reference pattern
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_OPERATOR;
 	pat->pattern = strdup(
-		"(\\[)|(\\])" // Matches a single bracket
+		"\\[|\\]" // Matches a single bracket
 	);
 	rz_pvector_push(pvec, pat);
 
 	// Separator pattern
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_SEPARATOR;
 	pat->pattern = strdup(
-		"([[:blank:]]+)");
+		"\\s+");
 	rz_pvector_push(pvec, pat);
 
 	return pvec;

diff --git a/librz/asm/p/asm_hexagon.c b/librz/asm/p/asm_hexagon.c
@@ -30,23 +30,23 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
 	RzAsmTokenPattern *pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_META;
 	pat->pattern = strdup(
-		"(^[\\[\\?\\/\\|\\\\\\{])|(┌)|(│)|(└)|" // Packet prefix
-		"((∎)|[<\\}])([ :])(endloop[01]{1,2})" // Endloop markers
+		"^[\\[\\?\\/\\|\\\\\\{┌│└]|" // Packet prefix
+		"(∎|[<\\}])[\\s:]endloop[01]{1,2}" // Endloop markers
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_META;
 	pat->pattern = strdup(
-		"(#{1,2})|(\\}$)|" // Immediate prefix, Closing packet bracket
+		"\\#{1,2}|\\}$|" // Immediate prefix, Closing packet bracket
 		"\\.new|:n?t|:raw|<err>" // .new and jump hints
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_REGISTER;
 	pat->pattern = strdup(
-		"([CNPRMQVO][[:digit:]]{1,2}(:[[:digit:]]{1,2})?(in)?)" // Registers and double registers
+		"[CNPRMQVO]\\d{1,2}(:\\d{1,2})?(in)?" // Registers and double registers
 	);
 	rz_pvector_push(pvec, pat);
 
@@ -60,51 +60,51 @@ static RZ_OWN RzPVector /*<RzAsmTokenPattern *>*/ *get_token_patterns(HexState *
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_NUMBER;
 	pat->pattern = strdup(
-		"(0x[[:digit:]abcdef]+)" // Hexadecimal numbers
+		"0x(\\d|[abcdef])+" // Hexadecimal numbers
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_MNEMONIC;
 	pat->pattern = strdup(
-		"([[:alpha:]]+[[:digit:]]+[[:alpha:]]*)" // Mnemonics with a decimal number in the name.
+		"[a-zA-Z]+\\d+[a-zA-Z]*" // Mnemonics with a decimal number in the name.
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_NUMBER;
 	pat->pattern = strdup(
-		"([[:digit:]]+)" // Decimal numbers
+		"\\d+" // Decimal numbers
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_SEPARATOR;
 	pat->pattern = strdup(
-		"([[:blank:]]+)|" // Spaces and tabs
-		"([,;\\.\\(\\)\\{\\}:])" // Brackets and others
+		"\\s+|" // Spaces and tabs
+		"[,;\\.\\(\\)\\{\\}:]" // Brackets and others
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_OPERATOR;
 	pat->pattern = strdup(
-		"(\\+)|(=)|(!)|(-)" // +,-,=,],[, ! (not the packet prefix)
+		"[\\+=!-]" // +,-,=,],[, ! (not the packet prefix)
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_OPERATOR;
 	pat->pattern = strdup(
-		"(\\])|(\\[|<{1,2}|>{1,2})" // +,-,=,],[, ! (not the packet prefix)
+		"\\]|\\[|<{1,2}|>{1,2}" // Matches ], [, <, <<, >, >>
 	);
 	rz_pvector_push(pvec, pat);
 
 	pat = RZ_NEW0(RzAsmTokenPattern);
 	pat->type = RZ_ASM_TOKEN_MNEMONIC;
 	pat->pattern = strdup(
-		"([[:alnum:]]+)|" // Alphanumeric mnemonics
-		"([[:alnum:]]+_[[:alnum:]]+)" // Menmonics with "_" e.g dealloc_return
+		"\\w+_\\w+|" // Mnemonics with "_", e.g. dealloc_return
+		"\\w+" // Alphanumeric mnemonics
 	);
 	rz_pvector_push(pvec, pat);
 

diff --git a/librz/cons/less.c b/librz/cons/less.c
@@ -3,9 +3,10 @@
 // SPDX-License-Identifier: LGPL-3.0-only
 
 #include <rz_cons.h>
-#include <rz_regex.h>
+#include <rz_util/rz_regex.h>
 #include <rz_util.h>
 #include "pager_private.h"
+#include "rz_vector.h"
 
 #define I(x) rz_cons_singleton()->x
 
@@ -31,7 +32,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
 	RzRegex *rx = NULL;
 	int w, h, ch, to, ui = 1, from = 0, i;
 	const char *sreg;
-	RzList **mla;
+	RzPVector **mla;
 
 	// rcons kills str after flushing the buffer, so we must keep a copy
 	char *ostr = strdup(str);
@@ -47,17 +48,14 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
 	if (lines_count < 1) {
 		mla = NULL;
 	} else {
-		mla = calloc(lines_count, sizeof(RzList *));
+		mla = calloc(lines_count, sizeof(RzPVector *));
 		if (!mla) {
 			free(p);
 			free(ostr);
 			free(lines);
 			return 0;
 		}
 	}
-	for (i = 0; i < lines_count; i++) {
-		mla[i] = rz_list_new();
-	}
 	rz_cons_set_raw(true);
 	rz_cons_show_cursor(false);
 	rz_cons_reset();
@@ -75,7 +73,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
 		ch = rz_cons_readchar();
 		if (exitkeys && strchr(exitkeys, ch)) {
 			for (i = 0; i < lines_count; i++) {
-				rz_list_free(mla[i]);
+				rz_pvector_free(mla[i]);
 			}
 			free(p);
 			free(mla);
@@ -129,7 +127,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
 				if (rx) {
 					rz_regex_free(rx);
 				}
-				rx = rz_regex_new(sreg, "");
+				rx = rz_regex_new(sreg, RZ_REGEX_EXTENDED | RZ_REGEX_MULTILINE, 0);
 			} else { /* we got an empty string */
 				from = pager_next_match(from, mla, lines_count);
 				break;
@@ -138,9 +136,12 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
 				break;
 			}
 			/* find all occurrences */
-			if (pager_all_matches(p, rx, mla, lines, lines_count)) {
-				from = pager_next_match(from, mla, lines_count);
+			RzPVector *matches = rz_regex_match_all_not_grouped(rx, str, RZ_REGEX_ZERO_TERMINATED, 0, RZ_REGEX_DEFAULT);
+			if (rz_pvector_empty(matches)) {
+				rz_pvector_free(matches);
+				break;
 			}
+			from = pager_next_match(from, mla, lines_count);
 			break;
 		case 'n': /* next match */
 			/* search already performed */
@@ -157,7 +158,7 @@ RZ_API int rz_cons_less_str(const char *str, const char *exitkeys) {
 		}
 	}
 	for (i = 0; i < lines_count; i++) {
-		rz_list_free(mla[i]);
+		rz_pvector_free(mla[i]);
 	}
 	free(mla);
 	rz_regex_free(rx);