Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use simplified Bitmap #120

Merged
merged 12 commits into from
Mar 11, 2024
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions boreal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ regex-automata = { version = "0.4", default-features = false, features = ["std",
# No default features to disable unicode, we do not need it
regex-syntax = { version = "0.8", default-features = false }

# Bitmap used during compilation of strings
bitmaps = "3.2"

# "hash" feature
crc32fast = { version = "1.4", optional = true }
hex = { version = "0.4", optional = true }
Expand Down
166 changes: 166 additions & 0 deletions boreal/src/bitmaps.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
use std::ops::BitOrAssign;

/// A bitmap with 256 bits, addressed by a `u8` index.
///
/// Bits `0..=127` are stored in `low` and bits `128..=255` in `high`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Bitmap {
    low: u128,
    high: u128,
}

impl Bitmap {
    /// Index of the first bit stored in the `high` half.
    const HALF: u8 = 128;

    /// Creates a bitmap with all 256 bits cleared.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the single-bit mask selecting `bit` inside its 128-bit half.
    fn mask(bit: u8) -> u128 {
        // `bit & 127` is the position of the bit relative to the start of its half.
        1u128 << (bit & 127)
    }

    /// Returns a copy of the half (`low` or `high`) that stores `bit`.
    fn get_half(&self, bit: u8) -> u128 {
        if bit < Self::HALF {
            self.low
        } else {
            self.high
        }
    }

    /// Returns a mutable reference to the half (`low` or `high`) that stores `bit`.
    fn get_half_mut(&mut self, bit: u8) -> &mut u128 {
        if bit < Self::HALF {
            &mut self.low
        } else {
            &mut self.high
        }
    }

    /// Returns `true` if `bit` is set.
    #[must_use]
    #[inline(always)]
    pub fn get(&self, bit: u8) -> bool {
        self.get_half(bit) & Self::mask(bit) != 0
    }

    /// Sets `bit` when `value` is `true`, clears it otherwise.
    #[inline(always)]
    pub fn set(&mut self, bit: u8, value: bool) {
        let mask = Self::mask(bit);
        let half = self.get_half_mut(bit);
        if value {
            *half |= mask;
        } else {
            *half &= !mask;
        }
    }

    /// Flips every one of the 256 bits.
    #[inline(always)]
    pub fn invert(&mut self) {
        self.low = !self.low;
        self.high = !self.high;
    }

    /// Returns the number of bits that are set.
    #[inline(always)]
    pub fn count_ones(&self) -> usize {
        (self.low.count_ones() + self.high.count_ones()) as usize
    }

    /// Returns an iterator over the indexes of the set bits, in increasing order.
    ///
    /// The iterator works on a copy of the bitmap: later changes to `self` are
    /// not reflected in an iterator that was already created.
    pub fn iter(&self) -> Iter {
        Iter(*self)
    }
}

/// Implements `|=`: every bit set in `rhs` becomes set in `self`.
impl BitOrAssign for Bitmap {
    fn bitor_assign(&mut self, rhs: Self) {
        self.low |= rhs.low;
        self.high |= rhs.high;
    }
}

/// Iterator over the indexes of the set bits of a [`Bitmap`], in increasing order.
///
/// Holds its own copy of the bitmap and clears each bit as it is yielded.
pub struct Iter(Bitmap);

impl Iterator for Iter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // Pick the lowest non-empty half, together with the index of its first bit.
        let (half, offset) = if self.0.low != 0 {
            (&mut self.0.low, 0)
        } else if self.0.high != 0 {
            (&mut self.0.high, Bitmap::HALF)
        } else {
            return None;
        };

        // The half is non-zero, so the result is in `0..=127`.
        let index = u8::try_from(half.trailing_zeros())
            .expect("trailing_zeros of a non-zero u128 is at most 127");

        // Clear the lowest set bit. The half is non-zero, so `- 1` cannot underflow.
        *half &= *half - 1;

        // `offset` is 0 or 128 and `index` is at most 127: the sum fits in a u8.
        Some(offset + index)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Exactly one item is yielded per bit still set.
        let remaining = self.0.count_ones();
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for Iter {}

impl std::iter::FusedIterator for Iter {}

#[cfg(test)]
mod test {
    use super::Bitmap;

    /// Checks `set`, `get`, `count_ones`, `iter` and `invert` on a bitmap with
    /// bits set in both halves, including the boundary indexes.
    #[test]
    fn test_bitmap() {
        let indexes: Vec<u8> = vec![0, 10, 17, 120, 127, 128, 129, 200, 255];

        let mut bitmap = Bitmap::new();
        for &index in &indexes {
            bitmap.set(index, true);
            assert!(bitmap.get(index));
        }

        // Only the bits that were explicitly set must be reported as set.
        for bit in 0..=u8::MAX {
            assert_eq!(bitmap.get(bit), indexes.contains(&bit));
        }
        assert_eq!(bitmap.count_ones(), indexes.len());

        // Iteration yields the set bits in increasing order.
        let collected: Vec<u8> = bitmap.iter().collect();
        assert_eq!(collected, indexes);

        // After inversion, exactly the complement is set.
        bitmap.invert();
        for bit in 0..=u8::MAX {
            assert_eq!(bitmap.get(bit), !indexes.contains(&bit));
        }
    }

    /// Checks iteration over the empty bitmap and over the full bitmap.
    #[test]
    fn test_bitmap_all() {
        let mut bitmap = Bitmap::new();
        let empty_len = bitmap.iter().count();
        assert_eq!(empty_len, 0);

        bitmap.invert();
        let full_len = bitmap.iter().count();
        assert_eq!(full_len, 256);
    }

    /// Checks that `|=` merges the bits of the right-hand side into the left.
    #[test]
    fn test_bitmap_or_assign() {
        let mut source = Bitmap::new();
        source.set(10, true);
        source.set(30, true);

        let mut target = Bitmap::new();
        target.set(20, true);
        target |= source;

        let merged: Vec<u8> = target.iter().collect();
        assert_eq!(vec![10, 20, 30], merged);
    }
}
1 change: 1 addition & 0 deletions boreal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ use tempfile as _;
use yara as _;

pub(crate) mod atoms;
mod bitmaps;
pub mod compiler;
pub use compiler::Compiler;
mod evaluator;
Expand Down
2 changes: 1 addition & 1 deletion boreal/src/matcher/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ impl Visitor for HirAnalyser {
}
Hir::Class(Class { bitmap, .. }) => {
if let Some(count) = &mut self.nb_alt_literals {
self.nb_alt_literals = count.checked_mul(bitmap.len());
self.nb_alt_literals = count.checked_mul(bitmap.count_ones());
}
self.has_classes = true;
}
Expand Down
32 changes: 12 additions & 20 deletions boreal/src/matcher/literals.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Literal extraction and computation from variable expressions.
use crate::atoms::{atoms_rank, byte_rank};
use crate::bitmaps::Bitmap;
use crate::regex::{visit, Class, Hir, VisitAction, Visitor};
use bitmaps::Bitmap;

pub fn get_literals_details(hir: &Hir, dot_all: bool) -> LiteralsDetails {
let extractor = visit(hir, Extractor::new(dot_all));
Expand Down Expand Up @@ -84,14 +84,14 @@ struct Extractor {
#[derive(Debug)]
enum HirPartKind {
Literal(u8),
Class { bitmap: Bitmap<256> },
Class { bitmap: Bitmap },
}

impl HirPartKind {
fn combinations(&self) -> usize {
match self {
Self::Literal(_) => 1,
Self::Class { bitmap } => bitmap.len(),
Self::Class { bitmap } => bitmap.count_ones(),
}
}
}
Expand Down Expand Up @@ -197,13 +197,9 @@ fn generate_literals(parts: &[HirPart]) -> Vec<Vec<u8>> {
literals = literals
.iter()
.flat_map(|prefix| {
bitmap.into_iter().map(|b| {
bitmap.iter().map(|b| {
#[allow(clippy::cast_possible_truncation)]
vthib marked this conversation as resolved.
Show resolved Hide resolved
prefix
.iter()
.copied()
.chain(std::iter::once(b as u8))
.collect()
prefix.iter().copied().chain(std::iter::once(b)).collect()
})
})
.collect();
Expand Down Expand Up @@ -296,19 +292,15 @@ fn get_parts_rank(parts: &[HirPart]) -> Option<u32> {
HirPartKind::Literal(b) => {
quality += byte_rank(*b);

if !bitmap.get(*b as usize) {
let _r = bitmap.set(*b as usize, true);
if !bitmap.get(*b) {
bitmap.set(*b, true);
nb_uniq += 1;
}
}
#[allow(clippy::cast_possible_truncation)]
demoray marked this conversation as resolved.
Show resolved Hide resolved
HirPartKind::Class { bitmap: class } => {
quality += class
.into_iter()
.map(|v| byte_rank(v as u8))
.min()
.unwrap_or(0);
if class.into_iter().any(|b| !bitmap.get(b)) {
quality += class.iter().map(byte_rank).min().unwrap_or(0);
if class.iter().any(|b| !bitmap.get(b)) {
nb_uniq += 1;
}
bitmap |= *class;
Expand Down Expand Up @@ -355,7 +347,7 @@ impl Visitor for Extractor {
Hir::Dot => {
let mut bitmap = Bitmap::new();
if !self.dot_all {
let _r = bitmap.set(usize::from(b'\n'), true);
bitmap.set(b'\n', true);
}
bitmap.invert();
self.add_part(HirPartKind::Class { bitmap });
Expand All @@ -373,11 +365,11 @@ impl Visitor for Extractor {
let mut bitmap = Bitmap::new();
if *mask == 0x0F {
for c in 0..=15 {
let _ = bitmap.set(usize::from((c << 4) | *value), true);
bitmap.set((c << 4) | *value, true);
}
} else {
for c in 0..=15 {
let _ = bitmap.set(usize::from(c | *value), true);
bitmap.set(c | *value, true);
}
}
if *negated {
Expand Down
10 changes: 4 additions & 6 deletions boreal/src/matcher/only_literals.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use bitmaps::Bitmap;

use crate::bitmaps::Bitmap;
use crate::regex::{visit, Class, Hir, VisitAction, Visitor};

/// Can the hex string be expressed using only literals.
Expand Down Expand Up @@ -48,19 +47,18 @@ impl Extractor {
self.cartesian_product(&suffixes);
}

fn add_class(&mut self, bitmap: &Bitmap<256>) {
fn add_class(&mut self, bitmap: &Bitmap) {
// First, commit the local buffer, to have a proper list of all possible literals
self.commit_buffer();

if let Some(all) = self.all.as_mut() {
*all = all
.iter()
.flat_map(|prefix| {
bitmap.into_iter().map(move |byte| {
bitmap.iter().map(move |byte| {
let mut v = Vec::with_capacity(prefix.len() + 1);
v.extend(prefix);
#[allow(clippy::cast_possible_truncation)]
v.push(byte as u8);
v.push(byte);
v
})
})
Expand Down
Loading
Loading