Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use simplified Bitmap #120

Merged
merged 12 commits into from
Mar 11, 2024
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions boreal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ regex-automata = { version = "0.4", default-features = false, features = ["std",
# No default features to disable unicode, we do not need it
regex-syntax = { version = "0.8", default-features = false }

# Bitmap used during compilation of strings
bitmaps = "3.2"

# "hash" feature
crc32fast = { version = "1.4", optional = true }
hex = { version = "0.4", optional = true }
Expand Down
146 changes: 146 additions & 0 deletions boreal/src/bitmaps.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
use std::ops::BitOrAssign;

/// A bitmap with 256 bits, indexed by a `u8`.
///
/// The bits are stored in two 128-bit halves: bits 0 to 127 live in `low`,
/// bits 128 to 255 live in `high`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Bitmap {
    /// Bits 0 to 127; bit `n` is stored at position `n`.
    low: u128,
    /// Bits 128 to 255; bit `n` is stored at position `n - 128`.
    high: u128,
}

impl Bitmap {
    /// Index of the first bit stored in the `high` half.
    const HALF: u8 = 128;

    /// Creates a bitmap with every bit cleared.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the single-bit mask selecting `bit` inside the half that holds it.
    fn mask(bit: u8) -> u128 {
        // `bit & 127` is the position of the bit relative to the start of its half.
        1u128 << (bit & 127)
    }

    /// Returns a copy of the half that holds `bit`.
    fn get_half(&self, bit: u8) -> u128 {
        if bit < Self::HALF {
            self.low
        } else {
            self.high
        }
    }

    /// Returns a mutable reference to the half that holds `bit`.
    fn get_half_mut(&mut self, bit: u8) -> &mut u128 {
        if bit < Self::HALF {
            &mut self.low
        } else {
            &mut self.high
        }
    }

    /// Returns `true` if `bit` is set.
    #[must_use]
    #[inline(always)]
    pub fn get(&self, bit: u8) -> bool {
        let mask = Self::mask(bit);
        let half = self.get_half(bit);
        half & mask != 0
    }

    /// Sets `bit`. Setting a bit that is already set is a no-op.
    #[inline(always)]
    pub fn set(&mut self, bit: u8) {
        let mask = Self::mask(bit);
        let half = self.get_half_mut(bit);
        *half |= mask;
    }

    /// Flips every one of the 256 bits.
    #[inline(always)]
    pub fn invert(&mut self) {
        self.low = !self.low;
        self.high = !self.high;
    }

    /// Returns the number of bits that are set.
    #[inline(always)]
    pub fn count_ones(&self) -> usize {
        (self.low.count_ones() + self.high.count_ones()) as usize
    }

    /// Returns an iterator over the indexes of the set bits, in ascending order.
    ///
    /// The iterator works on a copy of the bitmap, so it does not borrow `self`.
    pub fn iter(&self) -> Iter {
        Iter(*self)
    }
}

/// Implements `|=`: every bit set in `rhs` becomes set in `self`.
impl BitOrAssign for Bitmap {
    fn bitor_assign(&mut self, rhs: Self) {
        self.low |= rhs.low;
        self.high |= rhs.high;
    }
}

/// Iterator over the indexes of the set bits of a [`Bitmap`], in ascending order.
///
/// It owns a copy of the bitmap and clears each bit as it is yielded.
pub struct Iter(Bitmap);

impl Iterator for Iter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // Drain the low half first so that indexes come out in ascending order.
        let (half, offset) = if self.0.low != 0 {
            (&mut self.0.low, 0)
        } else if self.0.high != 0 {
            (&mut self.0.high, Bitmap::HALF)
        } else {
            return None;
        };

        // `half` is non-zero, so the position of its lowest set bit is in [0; 127]
        // and always fits in a u8.
        let pos = u8::try_from(half.trailing_zeros())
            .expect("trailing_zeros of a non-zero u128 is at most 127");
        // Clear the bit that is about to be yielded.
        *half &= !(1u128 << pos);
        Some(offset + pos)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every remaining set bit yields exactly one item, so the count is exact.
        let remaining = self.0.count_ones();
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for Iter {}

// Once both halves are empty they stay empty, so `next` keeps returning `None`.
impl std::iter::FusedIterator for Iter {}

#[cfg(test)]
mod test {
    use super::Bitmap;

    /// Sets a scattered selection of bits, including both ends of each half,
    /// and checks `get`, `count_ones`, `iter` and `invert` against it.
    #[test]
    fn test_bitmap() {
        let expected: [u8; 9] = [0, 10, 17, 120, 127, 128, 129, 200, 255];

        let mut bits = Bitmap::new();
        for &idx in &expected {
            bits.set(idx);
            assert!(bits.get(idx));
        }

        // Exactly the expected bits are set, and no others.
        for idx in 0..=u8::MAX {
            assert_eq!(bits.get(idx), expected.contains(&idx));
        }
        assert_eq!(bits.count_ones(), expected.len());

        // Iteration yields the set bits in ascending order.
        let collected: Vec<u8> = bits.iter().collect();
        assert_eq!(collected, expected);

        // After inversion, membership is the exact opposite.
        bits.invert();
        for idx in 0..=u8::MAX {
            assert_eq!(bits.get(idx), !expected.contains(&idx));
        }
    }

    /// An empty bitmap yields nothing; an inverted empty bitmap yields all 256 bits.
    #[test]
    fn test_bitmap_all() {
        let mut bits = Bitmap::new();
        assert_eq!(bits.iter().count(), 0);

        bits.invert();
        assert_eq!(bits.iter().count(), 256);
    }

    /// `|=` produces the union of both bitmaps.
    #[test]
    fn test_bitmap_or_assign() {
        let mut source = Bitmap::new();
        source.set(10);
        source.set(30);

        let mut target = Bitmap::new();
        target.set(20);
        target |= source;

        let merged: Vec<u8> = target.iter().collect();
        assert_eq!(merged, [10, 20, 30]);
    }
}
1 change: 1 addition & 0 deletions boreal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ use tempfile as _;
use yara as _;

pub(crate) mod atoms;
mod bitmaps;
pub mod compiler;
pub use compiler::Compiler;
mod evaluator;
Expand Down
2 changes: 1 addition & 1 deletion boreal/src/matcher/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ impl Visitor for HirAnalyser {
}
Hir::Class(Class { bitmap, .. }) => {
if let Some(count) = &mut self.nb_alt_literals {
self.nb_alt_literals = count.checked_mul(bitmap.len());
self.nb_alt_literals = count.checked_mul(bitmap.count_ones());
}
self.has_classes = true;
}
Expand Down
36 changes: 13 additions & 23 deletions boreal/src/matcher/literals.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Literal extraction and computation from variable expressions.
use crate::atoms::{atoms_rank, byte_rank};
use crate::bitmaps::Bitmap;
use crate::regex::{visit, Class, Hir, VisitAction, Visitor};
use bitmaps::Bitmap;

pub fn get_literals_details(hir: &Hir, dot_all: bool) -> LiteralsDetails {
let extractor = visit(hir, Extractor::new(dot_all));
Expand Down Expand Up @@ -84,14 +84,14 @@ struct Extractor {
#[derive(Debug)]
enum HirPartKind {
Literal(u8),
Class { bitmap: Bitmap<256> },
Class { bitmap: Bitmap },
}

impl HirPartKind {
fn combinations(&self) -> usize {
match self {
Self::Literal(_) => 1,
Self::Class { bitmap } => bitmap.len(),
Self::Class { bitmap } => bitmap.count_ones(),
}
}
}
Expand Down Expand Up @@ -197,14 +197,9 @@ fn generate_literals(parts: &[HirPart]) -> Vec<Vec<u8>> {
literals = literals
.iter()
.flat_map(|prefix| {
bitmap.into_iter().map(|b| {
#[allow(clippy::cast_possible_truncation)]
prefix
.iter()
.copied()
.chain(std::iter::once(b as u8))
.collect()
})
bitmap
.iter()
.map(|b| prefix.iter().copied().chain(std::iter::once(b)).collect())
})
.collect();
}
Expand Down Expand Up @@ -296,19 +291,14 @@ fn get_parts_rank(parts: &[HirPart]) -> Option<u32> {
HirPartKind::Literal(b) => {
quality += byte_rank(*b);

if !bitmap.get(*b as usize) {
let _r = bitmap.set(*b as usize, true);
if !bitmap.get(*b) {
bitmap.set(*b);
nb_uniq += 1;
}
}
#[allow(clippy::cast_possible_truncation)]
HirPartKind::Class { bitmap: class } => {
quality += class
.into_iter()
.map(|v| byte_rank(v as u8))
.min()
.unwrap_or(0);
if class.into_iter().any(|b| !bitmap.get(b)) {
quality += class.iter().map(byte_rank).min().unwrap_or(0);
if class.iter().any(|b| !bitmap.get(b)) {
nb_uniq += 1;
}
bitmap |= *class;
Expand Down Expand Up @@ -355,7 +345,7 @@ impl Visitor for Extractor {
Hir::Dot => {
let mut bitmap = Bitmap::new();
if !self.dot_all {
let _r = bitmap.set(usize::from(b'\n'), true);
bitmap.set(b'\n');
}
bitmap.invert();
self.add_part(HirPartKind::Class { bitmap });
Expand All @@ -373,11 +363,11 @@ impl Visitor for Extractor {
let mut bitmap = Bitmap::new();
if *mask == 0x0F {
for c in 0..=15 {
let _ = bitmap.set(usize::from((c << 4) | *value), true);
bitmap.set((c << 4) | *value);
}
} else {
for c in 0..=15 {
let _ = bitmap.set(usize::from(c | *value), true);
bitmap.set(c | *value);
}
}
if *negated {
Expand Down
10 changes: 4 additions & 6 deletions boreal/src/matcher/only_literals.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use bitmaps::Bitmap;

use crate::bitmaps::Bitmap;
use crate::regex::{visit, Class, Hir, VisitAction, Visitor};

/// Can the hex string be expressed using only literals.
Expand Down Expand Up @@ -48,19 +47,18 @@ impl Extractor {
self.cartesian_product(&suffixes);
}

fn add_class(&mut self, bitmap: &Bitmap<256>) {
fn add_class(&mut self, bitmap: &Bitmap) {
// First, commit the local buffer, to have a proper list of all possible literals
self.commit_buffer();

if let Some(all) = self.all.as_mut() {
*all = all
.iter()
.flat_map(|prefix| {
bitmap.into_iter().map(move |byte| {
bitmap.iter().map(move |byte| {
let mut v = Vec::with_capacity(prefix.len() + 1);
v.extend(prefix);
#[allow(clippy::cast_possible_truncation)]
v.push(byte as u8);
v.push(byte);
v
})
})
Expand Down
Loading
Loading