From a486751b0c8550d6ebcef93ad7eafd2eb81ae842 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 11 Mar 2024 13:29:28 -0400 Subject: [PATCH] use simplified Bitmap (#120) Replace `bitmaps::Bitmap` with a simplified bitmap implementation for the 256 wide use case needed by boreal. This does a few things: * Provides a simplified interface for Bitmap based on what's used by boreal * Replaces a bunch of casts & slice indexing that could cause future issues with compile-time validation (by moving from indexing into the Bitmap by usize to u8) * Removes an MPL2-licensed dependency (which can be incompatible with some licensing requirements) --------- Co-authored-by: Brian Caswell --- Cargo.lock | 7 -- boreal/Cargo.toml | 3 - boreal/src/bitmaps.rs | 146 ++++++++++++++++++++++++++++ boreal/src/lib.rs | 1 + boreal/src/matcher/analysis.rs | 2 +- boreal/src/matcher/literals.rs | 36 +++---- boreal/src/matcher/only_literals.rs | 10 +- boreal/src/regex/hir.rs | 29 +++--- 8 files changed, 179 insertions(+), 55 deletions(-) create mode 100644 boreal/src/bitmaps.rs diff --git a/Cargo.lock b/Cargo.lock index 7dc06c5c..86c971ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -124,12 +124,6 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" -[[package]] -name = "bitmaps" -version = "3.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d084b0137aaa901caf9f1e8b21daa6aa24d41cd806e111335541eff9683bd6" - [[package]] name = "block-buffer" version = "0.10.4" @@ -146,7 +140,6 @@ dependencies = [ "aho-corasick", "authenticode-parser", "base64", - "bitmaps", "boreal-parser", "codespan-reporting", "crc32fast", diff --git a/boreal/Cargo.toml b/boreal/Cargo.toml index 457736dc..9260b0fc 100644 --- a/boreal/Cargo.toml +++ b/boreal/Cargo.toml @@ -51,9 +51,6 @@ regex-automata = { version = "0.4", default-features = false, features = ["std", # No default features to disable unicode, we
do not need it regex-syntax = { version = "0.8", default-features = false } -# Bitmap used during compilation of strings -bitmaps = "3.2" - # "hash" feature crc32fast = { version = "1.4", optional = true } hex = { version = "0.4", optional = true } diff --git a/boreal/src/bitmaps.rs b/boreal/src/bitmaps.rs new file mode 100644 index 00000000..f8074629 --- /dev/null +++ b/boreal/src/bitmaps.rs @@ -0,0 +1,146 @@ +use std::ops::BitOrAssign; + +#[derive(Debug, Clone, Copy, Default)] +/// A bitmap with 256 bits +pub struct Bitmap { + low: u128, + high: u128, +} + +impl Bitmap { + const HALF: u8 = 128; + + pub fn new() -> Self { + Self::default() + } + + fn mask(bit: u8) -> u128 { + 1u128 << (bit & 127) + } + + fn get_half(&self, bit: u8) -> u128 { + if bit < Self::HALF { + self.low + } else { + self.high + } + } + + fn get_half_mut(&mut self, bit: u8) -> &mut u128 { + if bit < Self::HALF { + &mut self.low + } else { + &mut self.high + } + } + + #[must_use] + #[inline(always)] + pub fn get(&self, bit: u8) -> bool { + let mask = Self::mask(bit); + let half = self.get_half(bit); + half & mask != 0 + } + + #[inline(always)] + pub fn set(&mut self, bit: u8) { + let mask = Self::mask(bit); + let half = self.get_half_mut(bit); + *half |= mask; + } + + #[inline(always)] + pub fn invert(&mut self) { + self.low = !self.low; + self.high = !self.high; + } + + #[inline(always)] + pub fn count_ones(&self) -> usize { + (self.low.count_ones() + self.high.count_ones()) as usize + } + + pub fn iter(&self) -> Iter { + Iter(*self) + } +} + +/// implement `|=` +impl BitOrAssign for Bitmap { + fn bitor_assign(&mut self, rhs: Self) { + self.low |= rhs.low; + self.high |= rhs.high; + } +} + +pub struct Iter(Bitmap); + +impl Iterator for Iter { + type Item = u8; + + fn next(&mut self) -> Option { + let (v, offset) = if self.0.low != 0 { + (&mut self.0.low, 0) + } else if self.0.high != 0 { + (&mut self.0.high, 128) + } else { + return None; + }; + + // Safety: this value is contained in [0; 
127] so it always fits in a u8. + let t: u8 = v.trailing_zeros().try_into().unwrap(); + *v &= !(1 << t); + Some(offset + t) + } +} + +#[cfg(test)] +mod test { + use super::Bitmap; + + #[test] + + fn test_bitmap() { + let mut bitmap = Bitmap::new(); + + let indexes = vec![0, 10, 17, 120, 127, 128, 129, 200, 255]; + for i in &indexes { + bitmap.set(*i); + assert!(bitmap.get(*i)); + } + + for i in 0..=255 { + assert_eq!(bitmap.get(i), indexes.contains(&i)); + } + assert_eq!(bitmap.count_ones(), indexes.len()); + + let value = bitmap.iter().collect::>(); + assert_eq!(value, indexes); + + bitmap.invert(); + for i in 0..=255 { + assert_eq!(bitmap.get(i), !indexes.contains(&i)); + } + } + + #[test] + fn test_bitmap_all() { + let mut bitmap = Bitmap::new(); + assert_eq!(bitmap.iter().count(), 0); + bitmap.invert(); + assert_eq!(bitmap.iter().count(), 256); + } + + #[test] + fn test_bitmap_or_assign() { + let mut bitmap = Bitmap::new(); + bitmap.set(10); + bitmap.set(30); + + let mut bitmap2 = Bitmap::new(); + bitmap2.set(20); + bitmap2 |= bitmap; + + assert_eq!(vec![10, 20, 30], bitmap2.iter().collect::>()); + } +} diff --git a/boreal/src/lib.rs b/boreal/src/lib.rs index 6d595c16..b690fb86 100644 --- a/boreal/src/lib.rs +++ b/boreal/src/lib.rs @@ -85,6 +85,7 @@ use tempfile as _; use yara as _; pub(crate) mod atoms; +mod bitmaps; pub mod compiler; pub use compiler::Compiler; mod evaluator; diff --git a/boreal/src/matcher/analysis.rs b/boreal/src/matcher/analysis.rs index 758f9430..80c91970 100644 --- a/boreal/src/matcher/analysis.rs +++ b/boreal/src/matcher/analysis.rs @@ -115,7 +115,7 @@ impl Visitor for HirAnalyser { } Hir::Class(Class { bitmap, .. 
}) => { if let Some(count) = &mut self.nb_alt_literals { - self.nb_alt_literals = count.checked_mul(bitmap.len()); + self.nb_alt_literals = count.checked_mul(bitmap.count_ones()); } self.has_classes = true; } diff --git a/boreal/src/matcher/literals.rs b/boreal/src/matcher/literals.rs index 5443bf86..aa506b2d 100644 --- a/boreal/src/matcher/literals.rs +++ b/boreal/src/matcher/literals.rs @@ -1,7 +1,7 @@ //! Literal extraction and computation from variable expressions. use crate::atoms::{atoms_rank, byte_rank}; +use crate::bitmaps::Bitmap; use crate::regex::{visit, Class, Hir, VisitAction, Visitor}; -use bitmaps::Bitmap; pub fn get_literals_details(hir: &Hir, dot_all: bool) -> LiteralsDetails { let extractor = visit(hir, Extractor::new(dot_all)); @@ -84,14 +84,14 @@ struct Extractor { #[derive(Debug)] enum HirPartKind { Literal(u8), - Class { bitmap: Bitmap<256> }, + Class { bitmap: Bitmap }, } impl HirPartKind { fn combinations(&self) -> usize { match self { Self::Literal(_) => 1, - Self::Class { bitmap } => bitmap.len(), + Self::Class { bitmap } => bitmap.count_ones(), } } } @@ -197,14 +197,9 @@ fn generate_literals(parts: &[HirPart]) -> Vec> { literals = literals .iter() .flat_map(|prefix| { - bitmap.into_iter().map(|b| { - #[allow(clippy::cast_possible_truncation)] - prefix - .iter() - .copied() - .chain(std::iter::once(b as u8)) - .collect() - }) + bitmap + .iter() + .map(|b| prefix.iter().copied().chain(std::iter::once(b)).collect()) }) .collect(); } @@ -296,19 +291,14 @@ fn get_parts_rank(parts: &[HirPart]) -> Option { HirPartKind::Literal(b) => { quality += byte_rank(*b); - if !bitmap.get(*b as usize) { - let _r = bitmap.set(*b as usize, true); + if !bitmap.get(*b) { + bitmap.set(*b); nb_uniq += 1; } } - #[allow(clippy::cast_possible_truncation)] HirPartKind::Class { bitmap: class } => { - quality += class - .into_iter() - .map(|v| byte_rank(v as u8)) - .min() - .unwrap_or(0); - if class.into_iter().any(|b| !bitmap.get(b)) { + quality += 
class.iter().map(byte_rank).min().unwrap_or(0); + if class.iter().any(|b| !bitmap.get(b)) { nb_uniq += 1; } bitmap |= *class; @@ -355,7 +345,7 @@ impl Visitor for Extractor { Hir::Dot => { let mut bitmap = Bitmap::new(); if !self.dot_all { - let _r = bitmap.set(usize::from(b'\n'), true); + bitmap.set(b'\n'); } bitmap.invert(); self.add_part(HirPartKind::Class { bitmap }); @@ -373,11 +363,11 @@ impl Visitor for Extractor { let mut bitmap = Bitmap::new(); if *mask == 0x0F { for c in 0..=15 { - let _ = bitmap.set(usize::from((c << 4) | *value), true); + bitmap.set((c << 4) | *value); } } else { for c in 0..=15 { - let _ = bitmap.set(usize::from(c | *value), true); + bitmap.set(c | *value); } } if *negated { diff --git a/boreal/src/matcher/only_literals.rs b/boreal/src/matcher/only_literals.rs index e57df93b..4a730c6a 100644 --- a/boreal/src/matcher/only_literals.rs +++ b/boreal/src/matcher/only_literals.rs @@ -1,5 +1,4 @@ -use bitmaps::Bitmap; - +use crate::bitmaps::Bitmap; use crate::regex::{visit, Class, Hir, VisitAction, Visitor}; /// Can the hex string be expressed using only literals. 
@@ -48,7 +47,7 @@ impl Extractor { self.cartesian_product(&suffixes); } - fn add_class(&mut self, bitmap: &Bitmap<256>) { + fn add_class(&mut self, bitmap: &Bitmap) { // First, commit the local buffer, to have a proper list of all possible literals self.commit_buffer(); @@ -56,11 +55,10 @@ impl Extractor { *all = all .iter() .flat_map(|prefix| { - bitmap.into_iter().map(move |byte| { + bitmap.iter().map(move |byte| { let mut v = Vec::with_capacity(prefix.len() + 1); v.extend(prefix); - #[allow(clippy::cast_possible_truncation)] - v.push(byte as u8); + v.push(byte); v }) }) diff --git a/boreal/src/regex/hir.rs b/boreal/src/regex/hir.rs index 3178a502..76d3e5f4 100644 --- a/boreal/src/regex/hir.rs +++ b/boreal/src/regex/hir.rs @@ -1,11 +1,10 @@ -use std::ops::Range; - -use bitmaps::Bitmap; +use crate::bitmaps::Bitmap; use boreal_parser::hex_string::{Mask, Token}; use boreal_parser::regex::{ AssertionKind, BracketedClass, BracketedClassItem, ClassKind, Literal, LiteralChar, Node, PerlClass, PerlClassKind, RepetitionKind, RepetitionRange, }; +use std::ops::Range; /// HIR of a regular expression. /// @@ -83,7 +82,7 @@ pub struct Class { pub definition: ClassKind, /// Bitfield of which bytes are in the class. - pub bitmap: Bitmap<256>, + pub bitmap: Bitmap, } /// Convert a parsed regex AST into our HIR. 
@@ -241,7 +240,7 @@ fn is_meta_character(byte: u8) -> bool { ) } -fn class_to_bitmap(class_kind: &ClassKind, warnings: &mut Vec) -> Bitmap<256> { +fn class_to_bitmap(class_kind: &ClassKind, warnings: &mut Vec) -> Bitmap { match class_kind { ClassKind::Perl(p) => perl_class_to_bitmap(p), ClassKind::Bracketed(BracketedClass { items, negated }) => { @@ -254,13 +253,13 @@ fn class_to_bitmap(class_kind: &ClassKind, warnings: &mut Vec) -> } BracketedClassItem::Literal(lit) => { let byte = unwrap_literal(lit, warnings); - let _ = bitmap.set(usize::from(byte), true); + bitmap.set(byte); } BracketedClassItem::Range(lita, litb) => { let a = unwrap_literal(lita, warnings); let b = unwrap_literal(litb, warnings); for c in a..=b { - let _ = bitmap.set(usize::from(c), true); + bitmap.set(c); } } } @@ -274,31 +273,31 @@ fn class_to_bitmap(class_kind: &ClassKind, warnings: &mut Vec) -> } } -fn perl_class_to_bitmap(cls: &PerlClass) -> Bitmap<256> { +fn perl_class_to_bitmap(cls: &PerlClass) -> Bitmap { let PerlClass { kind, negated } = cls; let mut bitmap = Bitmap::new(); match kind { PerlClassKind::Word => { for c in b'0'..=b'9' { - let _ = bitmap.set(usize::from(c), true); + bitmap.set(c); } for c in b'A'..=b'Z' { - let _ = bitmap.set(usize::from(c), true); + bitmap.set(c); } - let _ = bitmap.set(usize::from(b'_'), true); + bitmap.set(b'_'); for c in b'a'..=b'z' { - let _ = bitmap.set(usize::from(c), true); + bitmap.set(c); } } PerlClassKind::Space => { for c in [b'\t', b'\n', b'\x0B', b'\x0C', b'\r', b' '] { - let _ = bitmap.set(usize::from(c), true); + bitmap.set(c); } } PerlClassKind::Digit => { for c in b'0'..=b'9' { - let _ = bitmap.set(usize::from(c), true); + bitmap.set(c); } } } @@ -320,7 +319,7 @@ impl From for Hir { Token::Byte(b) => Hir::Literal(b), Token::NotByte(b) => { let mut bitmap = Bitmap::new(); - let _ = bitmap.set(usize::from(b), true); + bitmap.set(b); bitmap.invert(); Hir::Class(Class {