Skip to content

Commit

Permalink
use simplified Bitmap (#120)
Browse files Browse the repository at this point in the history
Replace `bitmaps::Bitmap` with a simplified bitmap implementation for
the 256 wide use case needed by boreal.

This does a few things:
* Provides a simplified interface for Bitmap based on what's used by
  boreal
* Replaces a bunch of casts & slice indexing that could cause future
  issues with compile-time validation (by moving from indexing into the
  Bitmap by usize to u8)
* Removes an MPL2-licensed dependency (which can be incompatible with

---------

Co-authored-by: Brian Caswell <[email protected]>
  • Loading branch information
demoray and Brian Caswell authored Mar 11, 2024
1 parent 754d11b commit a486751
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 55 deletions.
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions boreal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ regex-automata = { version = "0.4", default-features = false, features = ["std",
# No default features to disable unicode, we do not need it
regex-syntax = { version = "0.8", default-features = false }

# Bitmap used during compilation of strings
bitmaps = "3.2"

# "hash" feature
crc32fast = { version = "1.4", optional = true }
hex = { version = "0.4", optional = true }
Expand Down
146 changes: 146 additions & 0 deletions boreal/src/bitmaps.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
use std::ops::BitOrAssign;

/// A fixed-size bitmap of exactly 256 bits, addressed by a `u8` index.
///
/// Bits 0 to 127 are stored in `low` and bits 128 to 255 in `high`. Inside
/// each half, bit `n` of the bitmap is bit `n & 127` of the integer.
/// The default value has every bit cleared.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct Bitmap {
    /// Bits 0 to 127 of the bitmap.
    low: u128,
    /// Bits 128 to 255 of the bitmap.
    high: u128,
}

impl Bitmap {
    /// Index of the first bit that lives in the `high` half.
    const HALF: u8 = 128;

    /// Creates a bitmap with all 256 bits cleared.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the single-bit mask selecting `bit` inside the half holding it.
    fn mask(bit: u8) -> u128 {
        // The position inside a half is the index modulo the half width.
        1u128 << (bit % Self::HALF)
    }

    /// Returns a copy of the half (`low` or `high`) that contains `bit`.
    fn get_half(&self, bit: u8) -> u128 {
        if bit >= Self::HALF {
            self.high
        } else {
            self.low
        }
    }

    /// Returns a mutable reference to the half that contains `bit`.
    fn get_half_mut(&mut self, bit: u8) -> &mut u128 {
        if bit >= Self::HALF {
            &mut self.high
        } else {
            &mut self.low
        }
    }

    /// Returns `true` if the bit at index `bit` is set.
    #[must_use]
    #[inline(always)]
    pub fn get(&self, bit: u8) -> bool {
        (self.get_half(bit) & Self::mask(bit)) != 0
    }

    /// Sets the bit at index `bit`. Setting an already set bit is a no-op.
    #[inline(always)]
    pub fn set(&mut self, bit: u8) {
        *self.get_half_mut(bit) |= Self::mask(bit);
    }

    /// Flips every one of the 256 bits.
    #[inline(always)]
    pub fn invert(&mut self) {
        let Self { low, high } = *self;
        *self = Self {
            low: !low,
            high: !high,
        };
    }

    /// Returns the number of bits that are set, between 0 and 256.
    #[inline(always)]
    pub fn count_ones(&self) -> usize {
        [self.low, self.high]
            .iter()
            .map(|half| half.count_ones() as usize)
            .sum()
    }

    /// Returns an iterator over the indexes of the set bits.
    ///
    /// The iterator works on a copy of the bitmap taken at call time.
    pub fn iter(&self) -> Iter {
        let snapshot = *self;
        Iter(snapshot)
    }
}

/// implement `|=`
impl BitOrAssign for Bitmap {
fn bitor_assign(&mut self, rhs: Self) {
self.low |= rhs.low;
self.high |= rhs.high;
}
}

pub struct Iter(Bitmap);

impl Iterator for Iter {
type Item = u8;

fn next(&mut self) -> Option<Self::Item> {
let (v, offset) = if self.0.low != 0 {
(&mut self.0.low, 0)
} else if self.0.high != 0 {
(&mut self.0.high, 128)
} else {
return None;
};

// Safety: this value is contained in [0; 127] so it always fits in a u8.
let t: u8 = v.trailing_zeros().try_into().unwrap();
*v &= !(1 << t);
Some(offset + t)
}
}

#[cfg(test)]
mod test {
    use super::Bitmap;

    /// Sets bits spread over both halves, then checks `get`, `count_ones`,
    /// `iter` and `invert` against the list of expected indexes.
    #[test]
    fn test_bitmap() {
        let indexes: Vec<u8> = vec![0, 10, 17, 120, 127, 128, 129, 200, 255];

        let mut bitmap = Bitmap::new();
        for &index in &indexes {
            bitmap.set(index);
            assert!(bitmap.get(index));
        }

        for bit in 0..=u8::MAX {
            assert_eq!(bitmap.get(bit), indexes.contains(&bit));
        }
        assert_eq!(bitmap.count_ones(), indexes.len());

        let value: Vec<u8> = bitmap.iter().collect();
        assert_eq!(value, indexes);

        // After inversion, exactly the bits that were not set must be set.
        bitmap.invert();
        for bit in 0..=u8::MAX {
            assert_eq!(bitmap.get(bit), !indexes.contains(&bit));
        }
    }

    /// An empty bitmap yields nothing; its inverse yields all 256 indexes.
    #[test]
    fn test_bitmap_all() {
        let mut bitmap = Bitmap::new();
        assert_eq!(bitmap.iter().count(), 0);

        bitmap.invert();
        assert_eq!(bitmap.iter().count(), 256);
    }

    /// `|=` merges the bits of the right-hand side into the left-hand side.
    #[test]
    fn test_bitmap_or_assign() {
        let mut bitmap = Bitmap::new();
        for bit in [10, 30] {
            bitmap.set(bit);
        }

        let mut bitmap2 = Bitmap::new();
        bitmap2.set(20);
        bitmap2 |= bitmap;

        let merged: Vec<u8> = bitmap2.iter().collect();
        assert_eq!(vec![10, 20, 30], merged);
    }
}
1 change: 1 addition & 0 deletions boreal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ use tempfile as _;
use yara as _;

pub(crate) mod atoms;
mod bitmaps;
pub mod compiler;
pub use compiler::Compiler;
mod evaluator;
Expand Down
2 changes: 1 addition & 1 deletion boreal/src/matcher/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ impl Visitor for HirAnalyser {
}
Hir::Class(Class { bitmap, .. }) => {
if let Some(count) = &mut self.nb_alt_literals {
self.nb_alt_literals = count.checked_mul(bitmap.len());
self.nb_alt_literals = count.checked_mul(bitmap.count_ones());
}
self.has_classes = true;
}
Expand Down
36 changes: 13 additions & 23 deletions boreal/src/matcher/literals.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Literal extraction and computation from variable expressions.
use crate::atoms::{atoms_rank, byte_rank};
use crate::bitmaps::Bitmap;
use crate::regex::{visit, Class, Hir, VisitAction, Visitor};
use bitmaps::Bitmap;

pub fn get_literals_details(hir: &Hir, dot_all: bool) -> LiteralsDetails {
let extractor = visit(hir, Extractor::new(dot_all));
Expand Down Expand Up @@ -84,14 +84,14 @@ struct Extractor {
#[derive(Debug)]
enum HirPartKind {
Literal(u8),
Class { bitmap: Bitmap<256> },
Class { bitmap: Bitmap },
}

impl HirPartKind {
fn combinations(&self) -> usize {
match self {
Self::Literal(_) => 1,
Self::Class { bitmap } => bitmap.len(),
Self::Class { bitmap } => bitmap.count_ones(),
}
}
}
Expand Down Expand Up @@ -197,14 +197,9 @@ fn generate_literals(parts: &[HirPart]) -> Vec<Vec<u8>> {
literals = literals
.iter()
.flat_map(|prefix| {
bitmap.into_iter().map(|b| {
#[allow(clippy::cast_possible_truncation)]
prefix
.iter()
.copied()
.chain(std::iter::once(b as u8))
.collect()
})
bitmap
.iter()
.map(|b| prefix.iter().copied().chain(std::iter::once(b)).collect())
})
.collect();
}
Expand Down Expand Up @@ -296,19 +291,14 @@ fn get_parts_rank(parts: &[HirPart]) -> Option<u32> {
HirPartKind::Literal(b) => {
quality += byte_rank(*b);

if !bitmap.get(*b as usize) {
let _r = bitmap.set(*b as usize, true);
if !bitmap.get(*b) {
bitmap.set(*b);
nb_uniq += 1;
}
}
#[allow(clippy::cast_possible_truncation)]
HirPartKind::Class { bitmap: class } => {
quality += class
.into_iter()
.map(|v| byte_rank(v as u8))
.min()
.unwrap_or(0);
if class.into_iter().any(|b| !bitmap.get(b)) {
quality += class.iter().map(byte_rank).min().unwrap_or(0);
if class.iter().any(|b| !bitmap.get(b)) {
nb_uniq += 1;
}
bitmap |= *class;
Expand Down Expand Up @@ -355,7 +345,7 @@ impl Visitor for Extractor {
Hir::Dot => {
let mut bitmap = Bitmap::new();
if !self.dot_all {
let _r = bitmap.set(usize::from(b'\n'), true);
bitmap.set(b'\n');
}
bitmap.invert();
self.add_part(HirPartKind::Class { bitmap });
Expand All @@ -373,11 +363,11 @@ impl Visitor for Extractor {
let mut bitmap = Bitmap::new();
if *mask == 0x0F {
for c in 0..=15 {
let _ = bitmap.set(usize::from((c << 4) | *value), true);
bitmap.set((c << 4) | *value);
}
} else {
for c in 0..=15 {
let _ = bitmap.set(usize::from(c | *value), true);
bitmap.set(c | *value);
}
}
if *negated {
Expand Down
10 changes: 4 additions & 6 deletions boreal/src/matcher/only_literals.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use bitmaps::Bitmap;

use crate::bitmaps::Bitmap;
use crate::regex::{visit, Class, Hir, VisitAction, Visitor};

/// Can the hex string be expressed using only literals.
Expand Down Expand Up @@ -48,19 +47,18 @@ impl Extractor {
self.cartesian_product(&suffixes);
}

fn add_class(&mut self, bitmap: &Bitmap<256>) {
fn add_class(&mut self, bitmap: &Bitmap) {
// First, commit the local buffer, to have a proper list of all possible literals
self.commit_buffer();

if let Some(all) = self.all.as_mut() {
*all = all
.iter()
.flat_map(|prefix| {
bitmap.into_iter().map(move |byte| {
bitmap.iter().map(move |byte| {
let mut v = Vec::with_capacity(prefix.len() + 1);
v.extend(prefix);
#[allow(clippy::cast_possible_truncation)]
v.push(byte as u8);
v.push(byte);
v
})
})
Expand Down
Loading

0 comments on commit a486751

Please sign in to comment.