From a8581ab78933f9d36104366eff5f7a1055d6c25d Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Tue, 13 Feb 2024 12:20:43 +0100
Subject: [PATCH] improve fidelity of the checks for small atoms to fit in the
 SmallAtom NodePtr representation

---
 fuzz/fuzz_targets/allocator.rs |  6 +++
 src/allocator.rs               | 96 +++++++++++++++++++++-------------
 2 files changed, 65 insertions(+), 37 deletions(-)
diff --git a/fuzz/fuzz_targets/allocator.rs b/fuzz/fuzz_targets/allocator.rs
index 6c23b88c..809beaed 100644
--- a/fuzz/fuzz_targets/allocator.rs
+++ b/fuzz/fuzz_targets/allocator.rs
@@ -1,6 +1,7 @@
 #![no_main]
 use libfuzzer_sys::fuzz_target;
 
+use clvmr::allocator::fits_in_small_atom;
 use clvmr::{Allocator, NodePtr};
 
 fn run_tests(a: &mut Allocator, atom1: NodePtr, data: &[u8]) {
@@ -21,6 +22,11 @@ fn run_tests(a: &mut Allocator, atom1: NodePtr, data: &[u8]) {
         assert_eq!(a.number(atom2), val.into());
         assert_eq!(a.atom_len(atom2), data.len());
         assert!(canonical);
+        assert_eq!(fits_in_small_atom(data), Some(val));
+    } else {
+        assert_eq!(fits_in_small_atom(data), None);
+        let val = a.number(atom1);
+        assert!(!canonical || val < 0.into() || val > ((1 << 26) - 1).into());
     }
 
     // number
diff --git a/src/allocator.rs b/src/allocator.rs
index 92fc610e..4e66e492 100644
--- a/src/allocator.rs
+++ b/src/allocator.rs
@@ -167,21 +167,28 @@ impl Default for Allocator {
     }
 }
 
-pub fn canonical_positive_integer(v: &[u8]) -> bool {
-    if v.is_empty() {
-        // empty buffer is 0/nil
-        true
-    } else if (v.len() == 1 && v[0] == 0)
+pub fn fits_in_small_atom(v: &[u8]) -> Option<u32> {
+    if !v.is_empty()
+        && (v.len() > 4
+        || (v.len() == 1 && v[0] == 0)
         // a 1-byte buffer of 0 is not the canonical representation of 0
         || (v[0] & 0x80) != 0
         // if the top bit is set, it's a negative number (i.e. not positive)
         || (v[0] == 0 && (v[1] & 0x80) == 0)
+        // if the buffer is 4 bytes, the top byte can't use more than 2 bits.
+        // otherwise the integer won't fit in 26 bits
+        || (v.len() == 4 && v[0] > 0x03))
     {
-        // if the top byte is a 0 but the top bit of the next byte is not set, that's a redundant
-        // leading zero. i.e. not canonical representation
-        false
+        // if the top byte is a 0 but the top bit of the next byte is not set,
+        // that's a redundant leading zero. i.e. not canonical representation
+        None
     } else {
-        true
+        let mut ret: u32 = 0;
+        for b in v {
+            ret <<= 8;
+            ret |= *b as u32;
+        }
+        Some(ret)
     }
 }
 
@@ -259,12 +266,7 @@ impl Allocator {
         }
         let idx = self.atom_vec.len();
         self.check_atom_limit()?;
-        if v.len() <= 3 && canonical_positive_integer(v) {
-            let mut ret: u32 = 0;
-            for b in v {
-                ret <<= 8;
-                ret |= *b as u32;
-            }
+        if let Some(ret) = fits_in_small_atom(v) {
             self.small_atoms += 1;
             Ok(NodePtr::new(ObjectType::SmallAtom, ret as usize))
         } else {
@@ -345,7 +347,10 @@ impl Allocator {
                 let buf: [u8; 4] = val.to_be_bytes();
                 let buf = &buf[4 - len as usize..];
                 let substr = &buf[start as usize..end as usize];
-                if !canonical_positive_integer(substr) {
+                if let Some(new_val) = fits_in_small_atom(substr) {
+                    self.small_atoms += 1;
+                    Ok(NodePtr::new(ObjectType::SmallAtom, new_val as usize))
+                } else {
                     let start = self.u8_vec.len();
                     let end = start + substr.len();
                     self.u8_vec.extend_from_slice(substr);
@@ -355,14 +360,6 @@ impl Allocator {
                         end: end as u32,
                     });
                     Ok(NodePtr::new(ObjectType::Bytes, idx))
-                } else {
-                    let mut new_val: u32 = 0;
-                    for i in substr {
-                        new_val <<= 8;
-                        new_val |= *i as u32;
-                    }
-                    self.small_atoms += 1;
-                    Ok(NodePtr::new(ObjectType::SmallAtom, new_val as usize))
                 }
             }
         }
@@ -509,16 +506,7 @@ impl Allocator {
             ObjectType::Bytes => {
                 let atom = self.atom_vec[node.index() as usize];
                 let buf = &self.u8_vec[atom.start as usize..atom.end as usize];
-                if buf.len() > 3 || !canonical_positive_integer(buf) {
-                    None
-                } else {
-                    let mut ret: u32 = 0;
-                    for v in buf {
-                        ret <<= 8;
-                        ret |= *v as u32;
-                    }
-                    Some(ret)
-                }
+                fits_in_small_atom(buf)
             }
             _ => None,
         }
@@ -1808,14 +1796,16 @@ fn test_auto_small_number(#[case] value: Number, #[case] expect_small: bool) {
 #[case(&[0xff], false)]
 #[case(&[0xff, 0xff], false)]
 #[case(&[0x80, 0xff, 0xff], false)]
-// we use a simple heuristic, for atoms. if we have more than 3 bytes, we assume
-// it's not small. Even though it would have fit in 26 bits
-#[case(&[0x1, 0xff, 0xff, 0xff], false)]
 // small positive intergers can be small
 #[case(&[0x01], true)]
 #[case(&[0x00, 0xff], true)]
 #[case(&[0x7f, 0xff], true)]
 #[case(&[0x7f, 0xff, 0xff], true)]
+#[case(&[0x00, 0xff, 0xff, 0xff], true)]
+#[case(&[0x02, 0x00, 0x00, 0x00], true)]
+#[case(&[0x03, 0xff, 0xff, 0xff], true)]
+// too big
+#[case(&[0x04, 0x00, 0x00, 0x00], false)]
 fn test_auto_small_number_from_buf(#[case] buf: &[u8], #[case] expect_small: bool) {
     let mut a = Allocator::new();
     let atom = a.new_atom(buf).expect("new_atom()");
@@ -1826,3 +1816,35 @@ fn test_auto_small_number_from_buf(#[case] buf: &[u8], #[case] expect_small: boo
     }
     assert_eq!(buf, a.atom(atom).as_ref());
 }
+
+#[cfg(test)]
+#[rstest]
+// redundant leading zeros are not canoncial
+#[case(&[0x00], None)]
+#[case(&[0x00, 0x7f], None)]
+// negative numbers cannot be small ints
+#[case(&[0x80], None)]
+#[case(&[0xff], None)]
+// redundant leading 0xff are still negative
+#[case(&[0xff, 0xff], None)]
+#[case(&[0x80, 0xff, 0xff], None)]
+// to big
+#[case(&[0x04, 0x00, 0x00, 0x00], None)]
+#[case(&[0x05, 0x00, 0x00, 0x00], None)]
+#[case(&[0x04, 0x00, 0x00, 0x00, 0x00], None)]
+// small positive intergers can be small
+#[case(&[0x01], Some(0x01))]
+#[case(&[0x00, 0x80], Some(0x80))]
+#[case(&[0x00, 0xff], Some(0xff))]
+#[case(&[0x7f, 0xff], Some(0x7fff))]
+#[case(&[0x00, 0x80, 0x00], Some(0x8000))]
+#[case(&[0x00, 0xff, 0xff], Some(0xffff))]
+#[case(&[0x7f, 0xff, 0xff], Some(0x7fffff))]
+#[case(&[0x00, 0x80, 0x00, 0x00], Some(0x800000))]
+#[case(&[0x00, 0xff, 0xff, 0xff], Some(0xffffff))]
+#[case(&[0x02, 0x00, 0x00, 0x00], Some(0x2000000))]
+#[case(&[0x03, 0x00, 0x00, 0x00], Some(0x3000000))]
+#[case(&[0x03, 0xff, 0xff, 0xff], Some(0x3ffffff))]
+fn test_fits_in_small_atom(#[case] buf: &[u8], #[case] expected: Option<u32>) {
+    assert_eq!(fits_in_small_atom(buf), expected);
+}