From 1c309a82499fc64fc490fcefb5da18abbfde7f6d Mon Sep 17 00:00:00 2001 From: secDre4mer <61268450+secDre4mer@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:48:33 +0200 Subject: [PATCH] feat: skip bytecode evaluation for some rules without string matches (#1927) * feat: skip bytecode evaluation for some rules without string matches Optimize the common case where a YARA condition has a form such as "... and 1 of them and ...", in other words, where the condition can only be true if at least one string matches. By noting these cases at compile time and recording in a bitmap whether a string match occurred, the condition evaluation for these rules can be skipped entirely in most cases. * feat: only initialize bitmap once at rules load time * fix: handle calloc fail --- libyara/exec.c | 5 +- libyara/grammar.y | 91 +++++++++++++++++++++++++++++++++ libyara/include/yara/compiler.h | 6 +++ libyara/include/yara/types.h | 10 ++++ libyara/rules.c | 21 ++++++++ libyara/scan.c | 4 ++ libyara/scanner.c | 15 ++++++ 7 files changed, 150 insertions(+), 2 deletions(-) diff --git a/libyara/exec.c b/libyara/exec.c index c4eeb7effe..4049ae3f70 100644 --- a/libyara/exec.c +++ b/libyara/exec.c @@ -1167,11 +1167,12 @@ int yr_execute_code(YR_SCAN_CONTEXT* context) current_rule = &context->rules->rules_table[current_rule_idx]; // If the rule is disabled let's skip its code. - ip = jmp_if(RULE_IS_DISABLED(current_rule), ip); + bool disabled = RULE_IS_DISABLED(current_rule) || yr_bitmask_is_not_set(context->rule_evaluate_condition_flags, current_rule_idx); + ip = jmp_if(disabled, ip); // Skip the bytes corresponding to the rule's index, but only if not // taking the jump. - if (!RULE_IS_DISABLED(current_rule)) + if (!disabled) ip += sizeof(uint32_t); break; diff --git a/libyara/grammar.y b/libyara/grammar.y index 2c1cda2f82..547e7f640d 100644 --- a/libyara/grammar.y +++ b/libyara/grammar.y @@ -304,6 +304,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%type regexp %type for_expression %type for_quantifier +%type condition %type arguments @@ -405,6 +406,10 @@ rule } condition '}' { + YR_RULE* rule = (YR_RULE*) yr_arena_ref_to_ptr( + compiler->arena, &$4); + rule->required_strings = $10.required_strings.count; + int result = yr_parser_reduce_rule_declaration_phase_2( yyscanner, &$4); // rule created in phase 1 @@ -455,6 +460,9 @@ strings condition : _CONDITION_ ':' boolean_expression + { + $$ = $3; + } ; @@ -992,6 +1000,7 @@ identifier $$.type = EXPRESSION_TYPE_BOOLEAN; $$.value.integer = YR_UNDEFINED; $$.identifier.ptr = NULL; + $$.required_strings.count = 0; } else { @@ -1311,6 +1320,14 @@ boolean_expression fail_if_error(yr_parser_emit( yyscanner, OP_STR_TO_BOOL, NULL)); } + if ($1.type != EXPRESSION_TYPE_BOOLEAN) + { + $$.required_strings.count = 0; + } + else + { + $$.required_strings.count = $1.required_strings.count; + } $$.type = EXPRESSION_TYPE_BOOLEAN; } @@ -1322,12 +1339,14 @@ expression fail_if_error(yr_parser_emit_push_const(yyscanner, 1)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | _FALSE_ { fail_if_error(yr_parser_emit_push_const(yyscanner, 0)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _MATCHES_ regexp { @@ -1340,6 +1359,7 @@ expression NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _CONTAINS_ primary_expression { @@ -1350,6 +1370,7 @@ expression yyscanner, OP_CONTAINS, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _ICONTAINS_ primary_expression { @@ -1360,6 +1381,7 @@ expression yyscanner, OP_ICONTAINS, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _STARTSWITH_ primary_expression { @@ -1370,6 +1392,7 @@ expression yyscanner, OP_STARTSWITH, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _ISTARTSWITH_ 
primary_expression { @@ -1380,6 +1403,7 @@ expression yyscanner, OP_ISTARTSWITH, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _ENDSWITH_ primary_expression { @@ -1390,6 +1414,7 @@ expression yyscanner, OP_ENDSWITH, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _IENDSWITH_ primary_expression { @@ -1400,6 +1425,7 @@ expression yyscanner, OP_IENDSWITH, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _IEQUALS_ primary_expression { @@ -1410,6 +1436,7 @@ expression yyscanner, OP_IEQUALS, NULL)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | _STRING_IDENTIFIER_ { @@ -1424,6 +1451,7 @@ expression fail_if_error(result); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 1; } | _STRING_IDENTIFIER_ _AT_ primary_expression { @@ -1438,6 +1466,7 @@ expression fail_if_error(result); + $$.required_strings.count = 1; $$.type = EXPRESSION_TYPE_BOOLEAN; } | _STRING_IDENTIFIER_ _IN_ range @@ -1449,6 +1478,7 @@ expression fail_if_error(result); + $$.required_strings.count = 1; $$.type = EXPRESSION_TYPE_BOOLEAN; } | _FOR_ for_expression error @@ -1691,6 +1721,7 @@ expression compiler->loop_index--; $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | for_expression _OF_ string_set { @@ -1699,6 +1730,18 @@ expression yywarning(yyscanner, "expression always false - requesting %" PRId64 " of %" PRId64 ".", $1.value.integer, $3); } + + if (($1.type == EXPRESSION_TYPE_INTEGER && $1.value.integer > 0) || + ($1.type == EXPRESSION_TYPE_QUANTIFIER && + ($1.value.integer == FOR_EXPRESSION_ALL || $1.value.integer == FOR_EXPRESSION_ANY))) + { + $$.required_strings.count = 1; + } + else + { + $$.required_strings.count = 0; + } + yr_parser_emit_with_arg(yyscanner, OP_OF, OF_STRING_SET, NULL, NULL); $$.type = EXPRESSION_TYPE_BOOLEAN; @@ -1713,6 +1756,7 @@ expression 
yr_parser_emit_with_arg(yyscanner, OP_OF, OF_RULE_SET, NULL, NULL); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression '%' _OF_ string_set { @@ -1731,6 +1775,15 @@ expression fail_with_error(ERROR_INVALID_PERCENTAGE); } + if (!IS_UNDEFINED($1.value.integer)) + { + $$.required_strings.count = 1; + } + else + { + $$.required_strings.count = 0; + } + yr_parser_emit_with_arg(yyscanner, OP_OF_PERCENT, OF_STRING_SET, NULL, NULL); } | primary_expression '%' _OF_ rule_set @@ -1760,6 +1813,17 @@ expression "expression always false - requesting %" PRId64 " of %" PRId64 ".", $1.value.integer, $3); } + if (($1.type == EXPRESSION_TYPE_INTEGER && $1.value.integer > 0) || + ($1.type == EXPRESSION_TYPE_QUANTIFIER && + ($1.value.integer == FOR_EXPRESSION_ALL || $1.value.integer == FOR_EXPRESSION_ANY))) + { + $$.required_strings.count = 1; + } + else + { + $$.required_strings.count = 0; + } + yr_parser_emit(yyscanner, OP_OF_FOUND_IN, NULL); $$.type = EXPRESSION_TYPE_BOOLEAN; @@ -1797,6 +1861,17 @@ expression "multiple strings at an offset is usually false."); } + if (($1.type == EXPRESSION_TYPE_INTEGER && $1.value.integer > 0) || + ($1.type == EXPRESSION_TYPE_QUANTIFIER && + ($1.value.integer == FOR_EXPRESSION_ALL || $1.value.integer == FOR_EXPRESSION_ANY))) + { + $$.required_strings.count = 1; + } + else + { + $$.required_strings.count = 0; + } + yr_parser_emit(yyscanner, OP_OF_FOUND_AT, NULL); $$.type = EXPRESSION_TYPE_BOOLEAN; @@ -1806,11 +1881,13 @@ expression yr_parser_emit(yyscanner, OP_NOT, NULL); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | _DEFINED_ boolean_expression { yr_parser_emit(yyscanner, OP_DEFINED, NULL); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | boolean_expression _AND_ { @@ -1856,6 +1933,7 @@ expression yr_free(fixup); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = $4.required_strings.count + $1.required_strings.count; } | boolean_expression _OR_ { 
@@ -1900,6 +1978,13 @@ expression yr_free(fixup); $$.type = EXPRESSION_TYPE_BOOLEAN; + + // Set required string count to minimum from both parts + if ($1.required_strings.count > $4.required_strings.count) { + $$.required_strings.count = $4.required_strings.count; + } else { + $$.required_strings.count = $1.required_strings.count; + } } | primary_expression _LT_ primary_expression { @@ -1907,6 +1992,7 @@ expression yyscanner, "<", $1, $3)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _GT_ primary_expression { @@ -1914,6 +2000,7 @@ expression yyscanner, ">", $1, $3)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _LE_ primary_expression { @@ -1921,6 +2008,7 @@ expression yyscanner, "<=", $1, $3)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _GE_ primary_expression { @@ -1928,6 +2016,7 @@ expression yyscanner, ">=", $1, $3)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _EQ_ primary_expression { @@ -1935,6 +2024,7 @@ expression yyscanner, "==", $1, $3)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression _NEQ_ primary_expression { @@ -1942,6 +2032,7 @@ expression yyscanner, "!=", $1, $3)); $$.type = EXPRESSION_TYPE_BOOLEAN; + $$.required_strings.count = 0; } | primary_expression { diff --git a/libyara/include/yara/compiler.h b/libyara/include/yara/compiler.h index acd173c8ca..a5bd5608f7 100644 --- a/libyara/include/yara/compiler.h +++ b/libyara/include/yara/compiler.h @@ -93,6 +93,12 @@ typedef struct _YR_EXPRESSION YR_ARENA_REF sized_string_ref; } value; + // Boolean expressions can hold a string count. If nonzero, this indicates that the condition + // can only be fulfilled if at least that many strings match. 
+ struct { + int count; + } required_strings; + // An expression can have an associated identifier, if "ptr" is not NULL it // points to the identifier name, if it is NULL, then "ref" holds a reference // to the identifier within YR_SZ_POOL. When the identifier is in YR_SZ_POOL diff --git a/libyara/include/yara/types.h b/libyara/include/yara/types.h index acfc832936..2d3b1340ef 100644 --- a/libyara/include/yara/types.h +++ b/libyara/include/yara/types.h @@ -293,6 +293,8 @@ struct YR_RULE // Number of atoms generated for this rule. int32_t num_atoms; + uint32_t required_strings; + DECLARE_REFERENCE(const char*, identifier); DECLARE_REFERENCE(const char*, tags); DECLARE_REFERENCE(YR_META*, metas); @@ -611,6 +613,10 @@ struct YR_RULES // the instructions are defined by the OP_X macros in exec.h. const uint8_t* code_start; + // A bitmap with one bit per rule, bit N is set when the condition for rule + // with index N might evaluate to true even without any string matches. + YR_BITMASK* rule_evaluate_condition_flags; + // Total number of rules. uint32_t num_rules; @@ -815,6 +821,10 @@ struct YR_SCAN_CONTEXT // until they can be confirmed or discarded. YR_MATCHES* unconfirmed_matches; + // A bitmap with one bit per rule, bit N is unset when the condition for rule + // with index N is guaranteed to evaluate to false. + YR_BITMASK* rule_evaluate_condition_flags; + // profiling_info is a pointer to an array of YR_PROFILING_INFO structures, // one per rule. Entry N has the profiling information for rule with index N. 
YR_PROFILING_INFO* profiling_info; diff --git a/libyara/rules.c b/libyara/rules.c index 63bfa10ddd..33a8ecc90f 100644 --- a/libyara/rules.c +++ b/libyara/rules.c @@ -336,6 +336,14 @@ int yr_rules_from_arena(YR_ARENA* arena, YR_RULES** rules) if (new_rules == NULL) return ERROR_INSUFFICIENT_MEMORY; + new_rules->rule_evaluate_condition_flags = (YR_BITMASK*) yr_calloc( + sizeof(YR_BITMASK), YR_BITMASK_SIZE(new_rules->num_rules)); + if (new_rules->rule_evaluate_condition_flags == NULL) + { + yr_free(new_rules); + return ERROR_INSUFFICIENT_MEMORY; + } + // Now YR_RULES relies on this arena, let's increment the arena's // reference count so that if the original owner of the arena calls // yr_arena_destroy the arena is not destroyed. @@ -364,6 +372,17 @@ int yr_rules_from_arena(YR_ARENA* arena, YR_RULES** rules) new_rules->code_start = yr_arena_get_ptr(arena, YR_CODE_SECTION, 0); + // If a rule has no required_strings, this means that the condition might + // evaluate to true without any matching strings, and we therefore have to + // mark it as "to be evaluated" from the beginning. 
+ for (int i = 0; i < new_rules->num_rules; i++) + { + if (new_rules->rules_table[i].required_strings == 0) + { + yr_bitmask_set(new_rules->rule_evaluate_condition_flags, i); + } + } + *rules = new_rules; return ERROR_SUCCESS; @@ -524,6 +543,8 @@ YR_API int yr_rules_destroy(YR_RULES* rules) external++; } + yr_free(rules->rule_evaluate_condition_flags); + yr_arena_release(rules->arena); yr_free(rules); diff --git a/libyara/scan.c b/libyara/scan.c index c96a625975..2dcae30e36 100644 --- a/libyara/scan.c +++ b/libyara/scan.c @@ -575,6 +575,8 @@ static int _yr_scan_verify_chained_string_match( match_data - match_offset + match->offset, match->data_length); + yr_bitmask_set(context->rule_evaluate_condition_flags, string->rule_idx); + FAIL_ON_ERROR(_yr_scan_add_match_to_list( match, &context->matches[string->idx], false)); } @@ -750,6 +752,8 @@ static int _yr_scan_match_callback( new_match->is_private = STRING_IS_PRIVATE(string); new_match->xor_key = callback_args->xor_key; + yr_bitmask_set(callback_args->context->rule_evaluate_condition_flags, string->rule_idx); + FAIL_ON_ERROR(_yr_scan_add_match_to_list( new_match, &callback_args->context->matches[string->idx], diff --git a/libyara/scanner.c b/libyara/scanner.c index 8e4d079e42..342bdcbfc7 100644 --- a/libyara/scanner.c +++ b/libyara/scanner.c @@ -213,6 +213,11 @@ static void _yr_scanner_clean_matches(YR_SCANNER* scanner) 0, sizeof(YR_BITMASK) * YR_BITMASK_SIZE(scanner->rules->num_rules)); + memset( + scanner->rule_evaluate_condition_flags, + 0, + sizeof(YR_BITMASK) * YR_BITMASK_SIZE(scanner->rules->num_rules)); + memset( scanner->ns_unsatisfied_flags, 0, @@ -259,6 +264,9 @@ YR_API int yr_scanner_create(YR_RULES* rules, YR_SCANNER** scanner) new_scanner->rule_matches_flags = (YR_BITMASK*) yr_calloc( sizeof(YR_BITMASK), YR_BITMASK_SIZE(rules->num_rules)); + new_scanner->rule_evaluate_condition_flags = (YR_BITMASK*) yr_calloc( + sizeof(YR_BITMASK), YR_BITMASK_SIZE(rules->num_rules)); + new_scanner->ns_unsatisfied_flags = 
(YR_BITMASK*) yr_calloc( sizeof(YR_BITMASK), YR_BITMASK_SIZE(rules->num_namespaces)); @@ -272,6 +280,7 @@ YR_API int yr_scanner_create(YR_RULES* rules, YR_SCANNER** scanner) rules->num_strings, sizeof(YR_MATCHES)); if (new_scanner->rule_matches_flags == NULL || + new_scanner->rule_evaluate_condition_flags == NULL || new_scanner->ns_unsatisfied_flags == NULL || new_scanner->strings_temp_disabled == NULL || new_scanner->matches == NULL || // @@ -358,6 +367,7 @@ YR_API void yr_scanner_destroy(YR_SCANNER* scanner) yr_free(scanner->rule_matches_flags); yr_free(scanner->ns_unsatisfied_flags); + yr_free(scanner->rule_evaluate_condition_flags); yr_free(scanner->strings_temp_disabled); yr_free(scanner->matches); yr_free(scanner->unconfirmed_matches); @@ -498,6 +508,11 @@ YR_API int yr_scanner_scan_mem_blocks( if (result != ERROR_SUCCESS) goto _exit; + memcpy( + scanner->rule_evaluate_condition_flags, + scanner->rules->rule_evaluate_condition_flags, + sizeof(YR_BITMASK) * YR_BITMASK_SIZE(rules->num_rules)); + yr_stopwatch_start(&scanner->stopwatch); block = iterator->first(iterator);