Summary of this patch (free text ahead of the first file header):

  * Rename the existing, non-validating length scanner in src/serde/tools.rs
    to serialized_length_from_bytes_trusted().
  * Add a new serialized_length_from_bytes() that also validates
    back-references, using an Allocator only to track the tree structure.
  * Export both functions from src/serde/mod.rs and move/extend the unit tests.
  * Add a "deserialize" criterion benchmark plus its binary input block.
  * Add a fuzz target for the trusted variant, and make the existing
    serialized_length fuzz target cross-check against node_from_bytes_backrefs().
  * Drop a stray println! from the run-program benchmark.

diff --git a/Cargo.toml b/Cargo.toml
index e8c49132..6d023b61 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -49,3 +49,7 @@ criterion = "0.5.1"
 [[bench]]
 name = "run-program"
 harness = false
+
+[[bench]]
+name = "deserialize"
+harness = false
diff --git a/benches/block_af9c3d98.bin b/benches/block_af9c3d98.bin
new file mode 100644
index 00000000..c14839da
Binary files /dev/null and b/benches/block_af9c3d98.bin differ
diff --git a/benches/deserialize.rs b/benches/deserialize.rs
new file mode 100644
index 00000000..9c5eb636
--- /dev/null
+++ b/benches/deserialize.rs
@@ -0,0 +1,58 @@
+use clvmr::allocator::Allocator;
+use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
+use std::time::Instant;
+use std::include_bytes;
+use clvmr::serde::serialized_length_from_bytes;
+use clvmr::serde::serialized_length_from_bytes_trusted;
+use clvmr::serde::node_from_bytes_backrefs;
+use clvmr::serde::node_from_bytes;
+
+fn deserialize_benchmark(c: &mut Criterion) {
+    let block = include_bytes!("block_af9c3d98.bin");
+
+    let mut group = c.benchmark_group("deserialize");
+    group.sample_size(10);
+    group.sampling_mode(SamplingMode::Flat);
+
+    group.bench_function("serialized_length_from_bytes", |b| {
+        b.iter(|| {
+            let start = Instant::now();
+            let _ = serialized_length_from_bytes(block);
+            start.elapsed()
+        })
+    });
+
+    group.bench_function("serialized_length_from_bytes_trusted", |b| {
+        b.iter(|| {
+            let start = Instant::now();
+            let _ = serialized_length_from_bytes_trusted(block);
+            start.elapsed()
+        })
+    });
+
+    let mut a = Allocator::new();
+    let iter_checkpoint = a.checkpoint();
+
+    group.bench_function("node_from_bytes_backrefs", |b| {
+        b.iter(|| {
+            a.restore_checkpoint(&iter_checkpoint);
+            let start = Instant::now();
+            let _ = node_from_bytes_backrefs(&mut a, block);
+            start.elapsed()
+        })
+    });
+
+    group.bench_function("node_from_bytes", |b| {
+        b.iter(|| {
+            a.restore_checkpoint(&iter_checkpoint);
+            let start = Instant::now();
+            let _ = node_from_bytes(&mut a, block);
+            start.elapsed()
+        })
+    });
+
+    group.finish();
+}
+
+criterion_group!(deserialize, deserialize_benchmark);
+criterion_main!(deserialize);
diff --git a/benches/run-program.rs b/benches/run-program.rs
index ef1cb729..9df8322f 100644
--- a/benches/run-program.rs
+++ b/benches/run-program.rs
@@ -224,7 +224,6 @@ fn run_program_benchmark(c: &mut Criterion) {
     ] {
         a.restore_checkpoint(&test_case_checkpoint);
 
-        println!("benchmark/{test}.hex");
         let prg = read_to_string(format!("benchmark/{test}.hex"))
             .expect("failed to load benchmark program");
         let prg = hex::decode(prg.trim()).expect("invalid hex in benchmark program");
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 77c475d3..87480146 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -27,6 +27,12 @@ path = "fuzz_targets/serialized_length.rs"
 test = false
 doc = false
 
+[[bin]]
+name = "fuzz_serialized_length_trusted"
+path = "fuzz_targets/serialized_length_trusted.rs"
+test = false
+doc = false
+
 [[bin]]
 name = "fuzz_deserialize"
 path = "fuzz_targets/deserialize.rs"
diff --git a/fuzz/fuzz_targets/serialized_length.rs b/fuzz/fuzz_targets/serialized_length.rs
index c0ce5ca4..339bfa7d 100644
--- a/fuzz/fuzz_targets/serialized_length.rs
+++ b/fuzz/fuzz_targets/serialized_length.rs
@@ -1,12 +1,31 @@
 #![no_main]
+use clvmr::serde::node_from_bytes_backrefs;
+use clvmr::serde::node_to_bytes;
 use clvmr::serde::serialized_length_from_bytes;
+use clvmr::Allocator;
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
-    let _len = match serialized_length_from_bytes(data) {
-        Err(_) => {
-            return;
+    let len = serialized_length_from_bytes(data);
+
+    let mut allocator = Allocator::new();
+    let program = node_from_bytes_backrefs(&mut allocator, data);
+
+    match (len, program) {
+        (Ok(_), Ok(_)) => {
+            // this is expected
+        }
+        (Err(_), Err(_)) => {
+            // this is expected
+        }
+        (Ok(len), Err(e)) => {
+            panic!("discrepancy between serialized_length and node_from_bytes_backrefs().\n {len}\n{e}");
+        }
+        (Err(e), Ok(program)) => {
+            panic!(
+                "discrepancy between serialized_length and node_from_bytes_backrefs().\n {e}\n{:?}",
+                node_to_bytes(&allocator, program)
+            );
         }
-        Ok(r) => r,
-    };
+    }
 });
diff --git a/fuzz/fuzz_targets/serialized_length_trusted.rs b/fuzz/fuzz_targets/serialized_length_trusted.rs
new file mode 100644
index 00000000..e5efe8a3
--- /dev/null
+++ b/fuzz/fuzz_targets/serialized_length_trusted.rs
@@ -0,0 +1,12 @@
+#![no_main]
+use clvmr::serde::serialized_length_from_bytes_trusted;
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let _len = match serialized_length_from_bytes_trusted(data) {
+        Err(_) => {
+            return;
+        }
+        Ok(r) => r,
+    };
+});
diff --git a/src/serde/mod.rs b/src/serde/mod.rs
index f3e9ad7d..430629a6 100644
--- a/src/serde/mod.rs
+++ b/src/serde/mod.rs
@@ -20,4 +20,6 @@ pub use de_br::node_from_bytes_backrefs;
 pub use de_tree::{parse_triples, ParsedTriple};
 pub use ser::node_to_bytes;
 pub use ser_br::node_to_bytes_backrefs;
-pub use tools::{serialized_length_from_bytes, tree_hash_from_stream};
+pub use tools::{
+    serialized_length_from_bytes, serialized_length_from_bytes_trusted, tree_hash_from_stream,
+};
diff --git a/src/serde/tools.rs b/src/serde/tools.rs
index d2fcad88..7d4cd03f 100644
--- a/src/serde/tools.rs
+++ b/src/serde/tools.rs
@@ -8,7 +8,7 @@ const MAX_SINGLE_BYTE: u8 = 0x7f;
 const BACK_REFERENCE: u8 = 0xfe;
 const CONS_BOX_MARKER: u8 = 0xff;
 
-pub fn serialized_length_from_bytes(b: &[u8]) -> io::Result<u64> {
+pub fn serialized_length_from_bytes_trusted(b: &[u8]) -> io::Result<u64> {
     let mut f = Cursor::new(b);
     let mut ops_counter = 1;
     let mut b = [0; 1];
@@ -107,6 +107,70 @@ pub fn tree_hash_from_stream(f: &mut Cursor<&[u8]>) -> io::Result<[u8; 32]> {
     Ok(values.pop().unwrap())
 }
 
+/// validate that a buffer is a valid CLVM serialization, and return the length
+/// of the CLVM object. This may fail if the serialization contains an invalid
+/// back-reference or if the buffer is truncated.
+pub fn serialized_length_from_bytes(b: &[u8]) -> io::Result<u64> {
+    use crate::serde::parse_atom::parse_path;
+    use crate::traverse_path::traverse_path;
+    use crate::{allocator::SExp, Allocator};
+
+    let mut f = Cursor::new(b);
+    let mut b = [0; 1];
+
+    // the allocator is just used to track the tree structure, in order to
+    // validate back-references
+    let mut allocator = Allocator::new();
+    let null = allocator.null();
+    let mut values = null;
+    let mut ops = vec![ParseOp::SExp];
+
+    while let Some(op) = ops.pop() {
+        match op {
+            ParseOp::SExp => {
+                f.read_exact(&mut b)?;
+                if b[0] == CONS_BOX_MARKER {
+                    ops.push(ParseOp::Cons);
+                    ops.push(ParseOp::SExp);
+                    ops.push(ParseOp::SExp);
+                } else if b[0] == BACK_REFERENCE {
+                    let path = parse_path(&mut f)?;
+                    let back_reference = traverse_path(&allocator, path, values)?.1;
+                    values = allocator.new_pair(back_reference, values)?;
+                } else if b[0] == 0x80 || b[0] <= MAX_SINGLE_BYTE {
+                    // This one byte we just read was the whole atom.
+                    // or the special case of NIL
+                    values = allocator.new_pair(null, values)?;
+                } else {
+                    let blob_size = decode_size(&mut f, b[0])?;
+                    f.seek(SeekFrom::Current(blob_size as i64))?;
+                    if (f.get_ref().len() as u64) < f.position() {
+                        return Err(bad_encoding());
+                    }
+                    values = allocator.new_pair(null, values)?;
+                }
+            }
+            ParseOp::Cons => {
+                // cons
+                let SExp::Pair(v1, v2) = allocator.sexp(values) else {
+                    return Err(bad_encoding());
+                };
+
+                let SExp::Pair(v3, v4) = allocator.sexp(v2) else {
+                    return Err(bad_encoding());
+                };
+
+                let new_root = allocator.new_pair(v3, v1)?;
+                values = allocator.new_pair(new_root, v4)?;
+            }
+        }
+    }
+    match allocator.sexp(values) {
+        SExp::Pair(_, _) => Ok(f.position()),
+        _ => Err(bad_encoding()),
+    }
+}
+
 #[test]
 fn test_tree_hash_max_single_byte() {
     let mut ctx = Sha256::new();
@@ -212,43 +276,6 @@ fn test_tree_hash_tree_large_atom() {
     );
 }
 
-#[test]
-fn test_serialized_length_from_bytes() {
-    assert_eq!(
-        serialized_length_from_bytes(&[0x7f, 0x00, 0x00, 0x00]).unwrap(),
-        1
-    );
-    assert_eq!(
-        serialized_length_from_bytes(&[0x80, 0x00, 0x00, 0x00]).unwrap(),
-        1
-    );
-    assert_eq!(
-        serialized_length_from_bytes(&[0xff, 0x00, 0x00, 0x00]).unwrap(),
-        3
-    );
-    assert_eq!(
-        serialized_length_from_bytes(&[0xff, 0x01, 0xff, 0x80, 0x80, 0x00]).unwrap(),
-        5
-    );
-
-    let e = serialized_length_from_bytes(&[0x8f, 0xff]).unwrap_err();
-    assert_eq!(e.kind(), bad_encoding().kind());
-    assert_eq!(e.to_string(), "bad encoding");
-
-    let e = serialized_length_from_bytes(&[0b11001111, 0xff]).unwrap_err();
-    assert_eq!(e.kind(), bad_encoding().kind());
-    assert_eq!(e.to_string(), "bad encoding");
-
-    let e = serialized_length_from_bytes(&[0b11001111, 0xff, 0, 0]).unwrap_err();
-    assert_eq!(e.kind(), bad_encoding().kind());
-    assert_eq!(e.to_string(), "bad encoding");
-
-    assert_eq!(
-        serialized_length_from_bytes(&[0x8f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).unwrap(),
-        16
-    );
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
@@ -256,6 +283,99 @@ mod test {
     use crate::Allocator;
     use rstest::rstest;
 
+    #[test]
+    fn test_serialized_length_from_bytes_trusted() {
+        assert_eq!(
+            serialized_length_from_bytes_trusted(&[0x7f, 0x00, 0x00, 0x00]).unwrap(),
+            1
+        );
+        assert_eq!(
+            serialized_length_from_bytes_trusted(&[0x80, 0x00, 0x00, 0x00]).unwrap(),
+            1
+        );
+        assert_eq!(
+            serialized_length_from_bytes_trusted(&[0xff, 0x00, 0x00, 0x00]).unwrap(),
+            3
+        );
+        assert_eq!(
+            serialized_length_from_bytes_trusted(&[0xff, 0x01, 0xff, 0x80, 0x80, 0x00]).unwrap(),
+            5
+        );
+
+        // this is an invalid back-ref
+        // but it's not validated
+        assert_eq!(
+            serialized_length_from_bytes_trusted(&[0xff, 0x01, 0xff, 0xfe, 0x10, 0x80, 0x00])
+                .unwrap(),
+            6
+        );
+
+        let e = serialized_length_from_bytes_trusted(&[0x8f, 0xff]).unwrap_err();
+        assert_eq!(e.kind(), bad_encoding().kind());
+        assert_eq!(e.to_string(), "bad encoding");
+
+        let e = serialized_length_from_bytes_trusted(&[0b11001111, 0xff]).unwrap_err();
+        assert_eq!(e.kind(), bad_encoding().kind());
+        assert_eq!(e.to_string(), "bad encoding");
+
+        let e = serialized_length_from_bytes_trusted(&[0b11001111, 0xff, 0, 0]).unwrap_err();
+        assert_eq!(e.kind(), bad_encoding().kind());
+        assert_eq!(e.to_string(), "bad encoding");
+
+        assert_eq!(
+            serialized_length_from_bytes_trusted(&[
+                0x8f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            ])
+            .unwrap(),
+            16
+        );
+    }
+
+    #[test]
+    fn test_serialized_length_from_bytes() {
+        use std::io::ErrorKind;
+        assert_eq!(
+            serialized_length_from_bytes(&[0x7f, 0x00, 0x00, 0x00]).unwrap(),
+            1
+        );
+        assert_eq!(
+            serialized_length_from_bytes(&[0x80, 0x00, 0x00, 0x00]).unwrap(),
+            1
+        );
+        assert_eq!(
+            serialized_length_from_bytes(&[0xff, 0x00, 0x00, 0x00]).unwrap(),
+            3
+        );
+        assert_eq!(
+            serialized_length_from_bytes(&[0xff, 0x01, 0xff, 0x80, 0x80, 0x00]).unwrap(),
+            5
+        );
+
+        // this is an invalid back-ref
+        let e =
+            serialized_length_from_bytes(&[0xff, 0x01, 0xff, 0xfe, 0x10, 0x80, 0x00]).unwrap_err();
+        assert_eq!(e.kind(), ErrorKind::Other);
+        assert_eq!(e.to_string(), "path into atom");
+
+        let e = serialized_length_from_bytes(&[0x8f, 0xff]).unwrap_err();
+        assert_eq!(e.kind(), bad_encoding().kind());
+        assert_eq!(e.to_string(), "bad encoding");
+
+        let e = serialized_length_from_bytes(&[0b11001111, 0xff]).unwrap_err();
+        assert_eq!(e.kind(), bad_encoding().kind());
+        assert_eq!(e.to_string(), "bad encoding");
+
+        let e = serialized_length_from_bytes(&[0b11001111, 0xff, 0, 0]).unwrap_err();
+        assert_eq!(e.kind(), bad_encoding().kind());
+        assert_eq!(e.to_string(), "bad encoding");
+
+        assert_eq!(
+            serialized_length_from_bytes(&[0x8f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+                .unwrap(),
+            16
+        );
+    }
+
     #[rstest]
     // ("foobar" "foobar")
     #[case("ff86666f6f626172ff86666f6f62617280")]