Add JAR fingerprinting (#3)

fossas · Jun 5, 2024 · db8c366 · db8c366
1 parent 9ccea79
commit db8c366
Show file tree

Hide file tree

Showing 22 changed files with 5,827 additions and 715 deletions.
diff --git a/.config/nextest.toml b/.config/nextest.toml
@@ -0,0 +1,6 @@
+[profile.default]
+
+[[profile.default.overrides]]
+# Tests that rely on Docker are retried, since they depend on external resources.
+filter = 'test(docker)'
+retries = { backoff = "exponential", count = 5, delay = "5s", jitter = true }
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,4 @@
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.jar filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/check-dynamic.yml b/.github/workflows/check-dynamic.yml
@@ -8,25 +8,36 @@ jobs:
         settings:
         - host: windows-latest
           setup: echo "no setup"
-          build: cargo build
+          build: cargo build --release
+          test: cargo nextest run --release
         - host: ubuntu-latest
           setup: echo "no setup"
-          build: cargo build
+          build: cargo build --release
+          test: cargo nextest run --release --features ci
         - host: macos-latest
-          setup: rustup target add aarch64-apple-darwin && rustup target add x86_64-apple-darwin
-          build: cargo build --target aarch64-apple-darwin && cargo build --target x86_64-apple-darwin
+          setup: |
+            rustup target add aarch64-apple-darwin &&
+            rustup target add x86_64-apple-darwin
+          build: |
+            cargo build --release --target aarch64-apple-darwin &&
+            cargo build --release --target x86_64-apple-darwin
+          test: |
+            cargo nextest run --target aarch64-apple-darwin --release &&
+            cargo nextest run --target x86_64-apple-darwin --release
 
     runs-on: ${{ matrix.settings.host }}
     name: test / ${{ matrix.settings.host }}
     steps:
     - uses: actions/checkout@v4
+      with:
+        lfs: true
     - uses: dtolnay/rust-toolchain@stable
     - uses: taiki-e/install-action@nextest
     - uses: Swatinem/rust-cache@v2
       with:
         key: ${{ matrix.settings.host }}
     - run: ${{ matrix.settings.setup }}
     - run: ${{ matrix.settings.build }}
-    - run: cargo check --release --all --bins --examples --tests
-    - run: cargo nextest run --all-targets
+    - run: ${{ matrix.settings.test }}
+    - run: cargo check --all --bins --examples --tests
     - run: cargo test --doc
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,8 @@
+
+# v2.0.0
+
+Refactored to the new Sparkle-based view of fingerprints.
+
+# v1.0.0
+
+Initial version, built for VSI specifically.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,15 +1,56 @@
 [package]
 name = "fingerprint"
-version = "1.0.1"
+version = "2.0.0"
 edition = "2021"
 
+[features]
+default = ["fp-content-serialize-hex"]
+
+# Serializes fingerprint content using hex encoding.
+# This is the default encoding since it is what most services expect.
+fp-content-serialize-hex = ["hex"]
+
+# Serializes fingerprint content using base64 encoding.
+#
+# If this is specified along with `fp-content-serialize-hex`,
+# the standard serde operations prefer `fp-content-serialize-hex`.
+fp-content-serialize-base64 = ["base64"]
+
+# Enable full CI tests.
+ci = ["docker-tests"]
+
+# Enable tests which require docker.
+#
+# In CI, this requires Linux;
+# GitHub Actions runners for macOS and Windows don't have Docker.
+docker-tests = []
+
+# Dependency features, auto added.
+base64 = ["dep:base64"]
+hex = ["dep:hex"]
+
 [dependencies]
 getset = "0.1.2"
-hex = "0.4.3"
 iter-read = "0.3.1"
 serde = { version = "1.0.140", features = ["derive"] }
 thiserror = "1.0.31"
 sha2 = "0.10.6"
+hex = { version = "0.4.3", optional = true }
+base64 = { version = "0.22.1", optional = true }
+zip = "2.1.2"
+strum = { version = "0.26.2", features = ["derive"] }
+alphanumeric-sort = "1.5.3"
+tap = "1.0.1"
+tracing = "0.1.40"
 
 [dev-dependencies]
-typed-builder = "0.10.0"
+pretty_assertions = "1.4.0"
+serde = { version = "1.0.140", features = ["derive"] }
+serde_json = "1.0.117"
+tar = "0.4.40"
+test-log = { version = "0.2.16", features = ["trace"] }
+maplit = "1.0.2"
+xshell = "0.2.6"
+tempfile = "3.10.1"
+lazy-regex = "3.1.0"
+itertools = "0.13.0"
diff --git a/README.md b/README.md
@@ -1 +1,15 @@
 # lib-fingerprint
+
+A fingerprint is a unique identifier for a file's contents.
+
+Fingerprints come in multiple "kinds", which are represented by textual identifiers.
+Fingerprints themselves are represented as binary blobs.
+
+Fingerprint kinds MUST maintain exact implementation compatibility; once the algorithm for a given kind
+has been created and its fingerprints have been crawled, it can't be changed. If a change is needed,
+it must be introduced as a new kind of fingerprint.
+
+This rule means that we start out with two kinds that existed prior to this library being created,
+which have specific rules about how to compute the fingerprint, and specific text identifiers.
+
+For more information, refer to the documentation for the types below.
diff --git a/src/fingerprint.rs b/src/fingerprint.rs
@@ -1,230 +1,4 @@
-use std::io::{self, BufRead, BufReader, Cursor, Read, Write};
-
-use iter_read::IterRead;
-use sha2::{Digest, Sha256};
-
-use crate::{stream::ConvertCRLFToLF, CommentStrippedSHA256, Error, Fingerprint, RawSHA256};
-
-/// Fingerprint the file using the [`RawSHA256`] kind.
-pub fn raw<R: BufRead>(stream: &mut R) -> Result<Fingerprint<RawSHA256>, Error> {
-    // Read the start of the stream, and decide whether to treat the rest of the stream as binary based on that.
-    let BinaryCheck { read, is_binary } = content_is_binary(stream)?;
-
-    // Chain the part of the stream already read to evaluate binary along with the rest of the stream.
-    let mut stream = Cursor::new(read).chain(stream);
-    let mut hasher = Sha256::new();
-    if is_binary {
-        content_binary(&mut stream, &mut hasher)?;
-    } else {
-        content_text(&mut stream, &mut hasher)?;
-    }
-
-    Fingerprint::from_digest(hasher)
-}
-
-/// Fingerprint the file using the [`CommentStrippedSHA256`] kind.
-pub fn comment_stripped<R: BufRead>(
-    stream: &mut R,
-) -> Result<Option<Fingerprint<CommentStrippedSHA256>>, Error> {
-    // Read the start of the stream, and decide whether to treat the rest of the stream as binary based on that.
-    let BinaryCheck { read, is_binary } = content_is_binary(stream)?;
-    if is_binary {
-        return Ok(None);
-    }
-
-    // Chain the part of the stream already read to evaluate binary along with the rest of the stream.
-    let mut stream = Cursor::new(read).chain(stream);
-    let mut hasher = Sha256::new();
-    match content_stripped(&mut stream, &mut hasher) {
-        Ok(_) => Some(Fingerprint::from_digest(hasher)).transpose(),
-        Err(err) => {
-            // The `io::Error` type is opaque.
-            // Handle the case of attempting to comment strip a binary file.
-            if err.to_string().to_lowercase().contains("utf-8") {
-                Ok(None)
-            } else {
-                Err(err)
-            }
-        }
-    }
-}
-
-/// The result of checking a file for whether it is binary.
-pub(crate) struct BinaryCheck {
-    pub(crate) read: Vec<u8>,
-    pub(crate) is_binary: bool,
-}
-
-/// Inspect the file to determine if it is binary.
-///
-/// Uses the same method as git: "is there a zero byte in the first 8000 bytes of the file"
-pub(crate) fn content_is_binary<R: Read>(stream: &mut R) -> Result<BinaryCheck, io::Error> {
-    let mut buf = Vec::new();
-    stream.take(8000).read_to_end(&mut buf)?;
-    let is_binary = buf.contains(&0);
-    Ok(BinaryCheck {
-        read: buf,
-        is_binary,
-    })
-}
-
-/// Reads the exact contents of a binary file without modification.
-pub(crate) fn content_binary(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
-    io::copy(stream, w)?;
-    Ok(())
-}
-
-/// Reads text files in a platform independent manner.
-///
-/// Specifically:
-/// - All text encodings are ignored; this function operates on raw bytes.
-/// - `git` implementations on Windows typically check out files with `\r\n` line endings,
-///   while *nix checks them out with `\n`.
-///   To be platform independent, any `\r\n` byte sequences found are converted to a single `\n`.
-pub(crate) fn content_text(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
-    let stream = BufReader::new(stream).bytes().crlf_to_lf().fuse();
-    io::copy(&mut IterRead::new(stream), w)?;
-    Ok(())
-}
-
-/// Hashes code files while removing C-style comments and blank lines in a platform independent manner.
-///
-/// Specifically:
-/// - All text encodings are treated as utf8.
-/// - `git` implementations on Windows typically check out files with `\r\n` line endings,
-///   while *nix checks them out with `\n`.
-///   To be platform independent, any `\r\n` byte sequences found are converted to a single `\n`.
-/// - C-style comments are removed:
-///   - `//` is considered the start of a single line comment; these bytes and any other bytes until right before a `\n` are removed.
-///   - `/*` is considered the start of a multi line comment; these bytes and any other bytes until after a `*/` is read are removed.
-///   - This function does not check for escaped comments.
-/// - Any sequence of multiple contiguous `\n` bytes are collapsed to a single `\n` byte.
-/// - The final `\n` byte is removed from the end of the stream if present.
-pub(crate) fn content_stripped(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
-    let mut buffered_output_line = String::new();
-    let mut is_multiline_active = false;
-
-    for line in stream.lines() {
-        let mut line = line?;
-
-        // At this point we know we have a new line coming. If a previous line is buffered and ready to write, do so now.
-        // Write it with a trailing newline because we know we'll be writing a following line.
-        if !buffered_output_line.is_empty() {
-            writeln!(w, "{buffered_output_line}")?;
-        }
-
-        (line, is_multiline_active) = clean_line(line, is_multiline_active);
-        line.trim().clone_into(&mut buffered_output_line);
-    }
-
-    // Now that we're done reading the input stream, if there's a buffered output line write it *without a trailing newline*.
-    write!(w, "{buffered_output_line}")?;
-    Ok(())
-}
-
-/// Part comment stripping, part state machine. Cleans lines of comments based on whether a previous invocation
-/// detected the start of a multi line comment.
-///
-/// This is very much not an ideal function: it scans the line multiple times instead of being forward-looking-only,
-/// and the dual responsibility makes it complicated. We should fix this, but moving forward for now.
-fn clean_line(line: String, is_multiline_active: bool) -> (String, bool) {
-    if is_multiline_active {
-        if let Some(end) = line.find("*/") {
-            return clean_line(line[end + 2..].to_string(), false);
-        }
-
-        (String::new(), true)
-    } else if let Some(start) = line.find("/*") {
-        let before_multi = line[..start].to_string();
-        let (after_multi, is_multi) = clean_line(line[start + 2..].to_string(), true);
-        (before_multi + &after_multi, is_multi)
-    } else if let Some(start) = line.find("//") {
-        (line[..start].to_string(), false)
-    } else {
-        (line, false)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    //! Tests for internal logic.
-
-    use super::*;
-
-    /// Inspired by the Haskell implementation: https://github.com/fossas/fossa-cli/blob/8de74b71b80d77321d64f94d7573773e49306772/test/App/Fossa/VSI/testdata/multi_line_comment.c#L1-L10
-    #[test]
-    fn comment_strip_mixed() {
-        let content = r#"/*
- * This is a placeholder file used to test comment stripping code.
-*/
-
-int main() {
-  int code = 0;
-  // code = 1;
-
-
-
-
-  return code; // perfect
-}
-"#;
-        let expected = r#"int main() {
-int code = 0;
-return code;
-}"#;
-
-        let mut buf = Vec::new();
-        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
-        assert_eq!(expected, String::from_utf8_lossy(&buf));
-    }
-
-    /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L71-L79
-    #[test]
-    fn comment_strip_single_line_comments() {
-        let content = " content1 \n content2 //comment \n content3 ";
-        let expected = "content1\ncontent2\ncontent3";
-
-        let mut buf = Vec::new();
-        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
-        assert_eq!(expected, String::from_utf8_lossy(&buf));
-    }
-
-    /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L89-L97
-    #[test]
-    fn comment_strip_multi_line_comments() {
-        let content =
-            " content1 \n  content2 /* begin comment \n end comment */ content3 \n content4 ";
-        let expected = "content1\ncontent2\ncontent3\ncontent4";
-
-        let mut buf = Vec::new();
-        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
-        assert_eq!(expected, String::from_utf8_lossy(&buf));
-    }
-
-    #[test]
-    fn comment_strip_cr() {
-        let content = "hello world\r\nanother line\r\na final line\n";
-        let expected = "hello world\nanother line\na final line";
-
-        let mut buf = Vec::new();
-        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
-        assert_eq!(expected, String::from_utf8_lossy(&buf));
-    }
-
-    #[test]
-    fn comment_strip_real_source() {
-        let content = include_bytes!("../testdata/facebook-folly-Version.cpp");
-        let expected = include_str!("../testdata/facebook-folly-Version.cpp.stripped");
-
-        let mut buf = Vec::new();
-        content_stripped(&mut Cursor::new(content), &mut buf).expect("must process");
-
-        assert_eq!(normalize_lf(expected), String::from_utf8_lossy(&buf));
-    }
-
-    /// Windows CI checks out CRLF. Normalize it to be LF only.
-    /// This function should only be applied to testing values, not responses from the functions being tested.
-    fn normalize_lf(input: impl Into<String>) -> String {
-        input.into().replace("\r\n", "\n")
-    }
-}
+pub mod binary;
+pub mod bytes;
+pub mod jar;
+pub mod text;