Skip to content

Commit

Permalink
feat: skip bytecode evaluation for some rules without string matches (#…
Browse files Browse the repository at this point in the history
…1927)

* feat: skip bytecode evaluation for some rules without string matches

Optimize a common case where YARA conditions have a form such as
"... and 1 of them and ...", in other words, conditions that can
only be true if at least one string matches.
By noting these cases and recording in a bitmap whether a string
match occurred, the condition evaluation for these rules can be
skipped entirely in most cases.

* feat: only initialize bitmap once at rules load time

* fix: handle calloc fail
  • Loading branch information
secDre4mer authored Jun 30, 2023
1 parent efa8765 commit 1c309a8
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 2 deletions.
5 changes: 3 additions & 2 deletions libyara/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -1167,11 +1167,12 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
current_rule = &context->rules->rules_table[current_rule_idx];

// If the rule is disabled let's skip its code.
ip = jmp_if(RULE_IS_DISABLED(current_rule), ip);
bool disabled = RULE_IS_DISABLED(current_rule) || yr_bitmask_is_not_set(context->rule_evaluate_condition_flags, current_rule_idx);
ip = jmp_if(disabled, ip);

// Skip the bytes corresponding to the rule's index, but only if not
// taking the jump.
if (!RULE_IS_DISABLED(current_rule))
if (!disabled)
ip += sizeof(uint32_t);

break;
Expand Down
91 changes: 91 additions & 0 deletions libyara/grammar.y
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%type <expression> regexp
%type <expression> for_expression
%type <expression> for_quantifier
%type <expression> condition


%type <c_string> arguments
Expand Down Expand Up @@ -405,6 +406,10 @@ rule
}
condition '}'
{
YR_RULE* rule = (YR_RULE*) yr_arena_ref_to_ptr(
compiler->arena, &$<rule>4);
rule->required_strings = $10.required_strings.count;

int result = yr_parser_reduce_rule_declaration_phase_2(
yyscanner, &$<rule>4); // rule created in phase 1

Expand Down Expand Up @@ -455,6 +460,9 @@ strings

condition
: _CONDITION_ ':' boolean_expression
{
$$ = $3;
}
;


Expand Down Expand Up @@ -992,6 +1000,7 @@ identifier
$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.value.integer = YR_UNDEFINED;
$$.identifier.ptr = NULL;
$$.required_strings.count = 0;
}
else
{
Expand Down Expand Up @@ -1311,6 +1320,14 @@ boolean_expression
fail_if_error(yr_parser_emit(
yyscanner, OP_STR_TO_BOOL, NULL));
}
if ($1.type != EXPRESSION_TYPE_BOOLEAN)
{
$$.required_strings.count = 0;
}
else
{
$$.required_strings.count = $1.required_strings.count;
}

$$.type = EXPRESSION_TYPE_BOOLEAN;
}
Expand All @@ -1322,12 +1339,14 @@ expression
fail_if_error(yr_parser_emit_push_const(yyscanner, 1));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| _FALSE_
{
fail_if_error(yr_parser_emit_push_const(yyscanner, 0));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _MATCHES_ regexp
{
Expand All @@ -1340,6 +1359,7 @@ expression
NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _CONTAINS_ primary_expression
{
Expand All @@ -1350,6 +1370,7 @@ expression
yyscanner, OP_CONTAINS, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _ICONTAINS_ primary_expression
{
Expand All @@ -1360,6 +1381,7 @@ expression
yyscanner, OP_ICONTAINS, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _STARTSWITH_ primary_expression
{
Expand All @@ -1370,6 +1392,7 @@ expression
yyscanner, OP_STARTSWITH, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _ISTARTSWITH_ primary_expression
{
Expand All @@ -1380,6 +1403,7 @@ expression
yyscanner, OP_ISTARTSWITH, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _ENDSWITH_ primary_expression
{
Expand All @@ -1390,6 +1414,7 @@ expression
yyscanner, OP_ENDSWITH, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _IENDSWITH_ primary_expression
{
Expand All @@ -1400,6 +1425,7 @@ expression
yyscanner, OP_IENDSWITH, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _IEQUALS_ primary_expression
{
Expand All @@ -1410,6 +1436,7 @@ expression
yyscanner, OP_IEQUALS, NULL));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| _STRING_IDENTIFIER_
{
Expand All @@ -1424,6 +1451,7 @@ expression
fail_if_error(result);

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 1;
}
| _STRING_IDENTIFIER_ _AT_ primary_expression
{
Expand All @@ -1438,6 +1466,7 @@ expression

fail_if_error(result);

$$.required_strings.count = 1;
$$.type = EXPRESSION_TYPE_BOOLEAN;
}
| _STRING_IDENTIFIER_ _IN_ range
Expand All @@ -1449,6 +1478,7 @@ expression

fail_if_error(result);

$$.required_strings.count = 1;
$$.type = EXPRESSION_TYPE_BOOLEAN;
}
| _FOR_ for_expression error
Expand Down Expand Up @@ -1691,6 +1721,7 @@ expression
compiler->loop_index--;

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| for_expression _OF_ string_set
{
Expand All @@ -1699,6 +1730,18 @@ expression
yywarning(yyscanner,
"expression always false - requesting %" PRId64 " of %" PRId64 ".", $1.value.integer, $3);
}

if (($1.type == EXPRESSION_TYPE_INTEGER && $1.value.integer > 0) ||
($1.type == EXPRESSION_TYPE_QUANTIFIER &&
($1.value.integer == FOR_EXPRESSION_ALL || $1.value.integer == FOR_EXPRESSION_ANY)))
{
$$.required_strings.count = 1;
}
else
{
$$.required_strings.count = 0;
}

yr_parser_emit_with_arg(yyscanner, OP_OF, OF_STRING_SET, NULL, NULL);

$$.type = EXPRESSION_TYPE_BOOLEAN;
Expand All @@ -1713,6 +1756,7 @@ expression
yr_parser_emit_with_arg(yyscanner, OP_OF, OF_RULE_SET, NULL, NULL);

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression '%' _OF_ string_set
{
Expand All @@ -1731,6 +1775,15 @@ expression
fail_with_error(ERROR_INVALID_PERCENTAGE);
}

if (!IS_UNDEFINED($1.value.integer))
{
$$.required_strings.count = 1;
}
else
{
$$.required_strings.count = 0;
}

yr_parser_emit_with_arg(yyscanner, OP_OF_PERCENT, OF_STRING_SET, NULL, NULL);
}
| primary_expression '%' _OF_ rule_set
Expand Down Expand Up @@ -1760,6 +1813,17 @@ expression
"expression always false - requesting %" PRId64 " of %" PRId64 ".", $1.value.integer, $3);
}

if (($1.type == EXPRESSION_TYPE_INTEGER && $1.value.integer > 0) ||
($1.type == EXPRESSION_TYPE_QUANTIFIER &&
($1.value.integer == FOR_EXPRESSION_ALL || $1.value.integer == FOR_EXPRESSION_ANY)))
{
$$.required_strings.count = 1;
}
else
{
$$.required_strings.count = 0;
}

yr_parser_emit(yyscanner, OP_OF_FOUND_IN, NULL);

$$.type = EXPRESSION_TYPE_BOOLEAN;
Expand Down Expand Up @@ -1797,6 +1861,17 @@ expression
"multiple strings at an offset is usually false.");
}

if (($1.type == EXPRESSION_TYPE_INTEGER && $1.value.integer > 0) ||
($1.type == EXPRESSION_TYPE_QUANTIFIER &&
($1.value.integer == FOR_EXPRESSION_ALL || $1.value.integer == FOR_EXPRESSION_ANY)))
{
$$.required_strings.count = 1;
}
else
{
$$.required_strings.count = 0;
}

yr_parser_emit(yyscanner, OP_OF_FOUND_AT, NULL);

$$.type = EXPRESSION_TYPE_BOOLEAN;
Expand All @@ -1806,11 +1881,13 @@ expression
yr_parser_emit(yyscanner, OP_NOT, NULL);

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| _DEFINED_ boolean_expression
{
yr_parser_emit(yyscanner, OP_DEFINED, NULL);
$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| boolean_expression _AND_
{
Expand Down Expand Up @@ -1856,6 +1933,7 @@ expression
yr_free(fixup);

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = $4.required_strings.count + $1.required_strings.count;
}
| boolean_expression _OR_
{
Expand Down Expand Up @@ -1900,48 +1978,61 @@ expression
yr_free(fixup);

$$.type = EXPRESSION_TYPE_BOOLEAN;

// Set required string count to minimum from both parts
if ($1.required_strings.count > $4.required_strings.count) {
$$.required_strings.count = $4.required_strings.count;
} else {
$$.required_strings.count = $1.required_strings.count;
}
}
| primary_expression _LT_ primary_expression
{
fail_if_error(yr_parser_reduce_operation(
yyscanner, "<", $1, $3));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _GT_ primary_expression
{
fail_if_error(yr_parser_reduce_operation(
yyscanner, ">", $1, $3));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _LE_ primary_expression
{
fail_if_error(yr_parser_reduce_operation(
yyscanner, "<=", $1, $3));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _GE_ primary_expression
{
fail_if_error(yr_parser_reduce_operation(
yyscanner, ">=", $1, $3));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _EQ_ primary_expression
{
fail_if_error(yr_parser_reduce_operation(
yyscanner, "==", $1, $3));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression _NEQ_ primary_expression
{
fail_if_error(yr_parser_reduce_operation(
yyscanner, "!=", $1, $3));

$$.type = EXPRESSION_TYPE_BOOLEAN;
$$.required_strings.count = 0;
}
| primary_expression
{
Expand Down
6 changes: 6 additions & 0 deletions libyara/include/yara/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ typedef struct _YR_EXPRESSION
YR_ARENA_REF sized_string_ref;
} value;

// Boolean expressions can hold a required-string count. If the count is
// non-zero, the condition can only be fulfilled if at least that many
// strings match.
struct {
int count;
} required_strings;

// An expression can have an associated identifier, if "ptr" is not NULL it
// points to the identifier name, if it is NULL, then "ref" holds a reference
// to the identifier within YR_SZ_POOL. When the identifier is in YR_SZ_POOL
Expand Down
10 changes: 10 additions & 0 deletions libyara/include/yara/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ struct YR_RULE
// Number of atoms generated for this rule.
int32_t num_atoms;

uint32_t required_strings;

DECLARE_REFERENCE(const char*, identifier);
DECLARE_REFERENCE(const char*, tags);
DECLARE_REFERENCE(YR_META*, metas);
Expand Down Expand Up @@ -611,6 +613,10 @@ struct YR_RULES
// the instructions are defined by the OP_X macros in exec.h.
const uint8_t* code_start;

// A bitmap with one bit per rule. Bit N is set when the condition for the
// rule with index N might evaluate to true even without any string matches.
YR_BITMASK* rule_evaluate_condition_flags;

// Total number of rules.
uint32_t num_rules;

Expand Down Expand Up @@ -815,6 +821,10 @@ struct YR_SCAN_CONTEXT
// until they can be confirmed or discarded.
YR_MATCHES* unconfirmed_matches;

// A bitmap with one bit per rule. Bit N is unset when the condition for the
// rule with index N is guaranteed to evaluate to false.
YR_BITMASK* rule_evaluate_condition_flags;

// profiling_info is a pointer to an array of YR_PROFILING_INFO structures,
// one per rule. Entry N has the profiling information for rule with index N.
YR_PROFILING_INFO* profiling_info;
Expand Down
Loading

0 comments on commit 1c309a8

Please sign in to comment.