Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maven Central fingerprint kind #6

Merged
merged 4 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@

# v2.1.0

- Adds new `Kind::JarMavenCentralV1` fingerprint.
- Attempts to improve performance of file-based fingerprinting by parallelizing across threads.
- This was done now that we have several kinds of fingerprints, and we'll probably just keep adding more.

# v2.0.0

Refactored to the new Sparkle-based view of fingerprints.
Expand Down
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fingerprint"
version = "2.0.0"
version = "2.1.0"
edition = "2021"

[features]
Expand Down Expand Up @@ -42,6 +42,7 @@ strum = { version = "0.26.2", features = ["derive"] }
alphanumeric-sort = "1.5.3"
tap = "1.0.1"
tracing = "0.1.40"
sha1 = "0.10.6"

[dev-dependencies]
pretty_assertions = "1.4.0"
Expand Down
10 changes: 10 additions & 0 deletions src/fingerprint/jar.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::io::{BufRead, Seek};

use sha1::Sha1;
use sha2::{Digest, Sha256};
use tap::Pipe;
use tracing::warn;
Expand All @@ -22,6 +23,15 @@ pub fn raw(stream: impl BufRead + Seek) -> Result<Option<Fingerprint>, Error> {
}
}

/// Fingerprint the java archive the same way as Maven Central.
///
/// Every byte that can still be read from `stream` is fed into a SHA-1 hasher,
/// and the resulting digest becomes the content of a
/// [`Kind::JarMavenCentralV1`] fingerprint.
///
/// On success this always yields `Some`; the `Option` is part of the signature
/// only, this function never produces `None`.
///
/// # Errors
///
/// Returns an error if copying the stream into the hasher fails.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn maven_central(mut stream: impl BufRead + Seek) -> Result<Option<Fingerprint>, Error> {
    let mut digest = Sha1::new();

    // The hasher is used as the write target, so the stream contents are
    // hashed as they are copied rather than being buffered in full first.
    let _bytes_hashed = std::io::copy(&mut stream, &mut digest)?;

    let fingerprint = Fingerprint::new(Kind::JarMavenCentralV1, Content::from_digest(digest));
    Ok(Some(fingerprint))
}

/// Fingerprint class files inside a java archive (a JAR).
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn class(stream: impl BufRead + Seek) -> Result<Option<Fingerprint>, Error> {
Expand Down
59 changes: 53 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ use std::{
fs::File,
io::{BufRead, BufReader, Cursor, Seek},
path::Path,
thread::ScopedJoinHandle,
};

use getset::Getters;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use strum::{AsRefStr, Display, EnumIter, IntoEnumIterator, VariantNames};
use tap::Pipe;
use thiserror::Error;

mod fingerprint;
Expand Down Expand Up @@ -123,6 +125,15 @@ pub enum Kind {
#[strum(serialize = "v1.raw.jar")]
JarRawV1,

/// Represents a fingerprint derived by hashing the raw contents of a JAR file in the same manner
/// as Maven Central. The idea is that such fingerprints can then be looked up via the
/// Maven Central REST API as a fallback to our own indexing.
///
/// Specifically:
/// - The content of the JAR file is hashed as-is using the sha1 algorithm.
#[strum(serialize = "v1.mavencentral.jar")]
JarMavenCentralV1,

/// Represents a fingerprint derived by hashing the raw contents of a JAR file with the SHA256 algorithm
/// in a platform-independent manner.
///
Expand Down Expand Up @@ -162,7 +173,7 @@ impl<'de> Deserialize<'de> for Kind {
}

/// An array of bytes representing a fingerprint's content.
#[derive(Clone, Eq, PartialEq, Hash, Default)]
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default)]
pub struct Content(Vec<u8>);

impl Content {
Expand Down Expand Up @@ -323,6 +334,7 @@ impl Fingerprint {
Kind::CommentStrippedSha256 => fingerprint::text::comment_stripped(stream),
Kind::JarRawV1 => fingerprint::jar::raw(stream),
Kind::JarClassV1 => fingerprint::jar::class(stream),
Kind::JarMavenCentralV1 => fingerprint::jar::maven_central(stream),
}
}

Expand Down Expand Up @@ -390,7 +402,11 @@ impl<K: Into<Kind>, C: Into<Content>> From<(K, C)> for Fingerprint {
pub struct Combined(HashMap<Kind, Content>);

impl Combined {
/// Fingerprint the provided stream (typically a file handle) with all fingerprint [`Kind`]s.
/// Fingerprint the provided stream with all fingerprint [`Kind`]s.
///
/// Note: this forces fingerprinting to be performed serially
/// since the stream has to be seeked backwards for each fingerprinter;
/// if this is not desired consider [`Combined::from_file`] or [`Combined::from_buffer`].
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn from_stream(mut stream: impl BufRead + Seek) -> Result<Self, Error> {
let mut fingerprints = Vec::new();
Expand All @@ -407,10 +423,22 @@ impl Combined {
}

/// Fingerprint the provided file with all fingerprint [`Kind`]s.
///
/// Note: this opens the file multiple times, once for each kind of fingerprint,
/// then runs each fingerprinter in its own thread.
/// If this is not desired consider [`Combined::from_stream`] or [`Combined::from_buffer`].
#[tracing::instrument(level = tracing::Level::DEBUG, ret)]
pub fn from_file(path: &Path) -> Result<Self, Error> {
let mut file = BufReader::new(File::open(path)?);
Self::from_stream(&mut file)
std::thread::scope(|scope| {
let handles = Kind::iter()
.map(|kind| scope.spawn(move || Fingerprint::from_file(kind, path)))
.collect::<Vec<_>>();

match collapse_handles(handles) {
Ok(fps) => fps.into_iter().flatten().pipe(Combined::from).pipe(Ok),
Err(err) => Err(err),
}
})
}

/// Fingerprint the provided buffer with all fingerprint [`Kind`]s.
Expand All @@ -425,8 +453,13 @@ impl Combined {
/// of errors in the future it isn't a breaking change.
#[tracing::instrument(level = tracing::Level::DEBUG, fields(buf = %buf.as_ref().len()), ret)]
pub fn from_buffer(buf: impl AsRef<[u8]>) -> Result<Self, Error> {
let mut content = Cursor::new(buf);
Self::from_stream(&mut content)
Kind::iter()
.map(|kind| Fingerprint::from_buffer(kind, buf.as_ref()))
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.flatten()
.pipe(Combined::from)
.pipe(Ok)
}

/// Create a new instance from a single fingerprint.
Expand Down Expand Up @@ -462,3 +495,17 @@ impl<I: IntoIterator<Item = F>, F: Into<Fingerprint>> From<I> for Combined {
)
}
}

/// Join a set of scoped thread handles and gather their results.
///
/// Handles are joined in the order given:
/// - If a thread panicked, that panic is resumed on the calling thread.
/// - If a thread returned `Err`, that error is returned immediately and the
///   remaining handles are dropped without being joined here.
/// - Otherwise every `Ok` value is collected, in handle order.
fn collapse_handles<T, E>(handles: Vec<ScopedJoinHandle<'_, Result<T, E>>>) -> Result<Vec<T>, E> {
    handles
        .into_iter()
        .map(|handle| {
            // A join error carries the panic payload of the thread; re-raise it as-is.
            handle
                .join()
                .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
        })
        // Collecting into `Result` stops at the first `Err`, and `map` is lazy,
        // so later handles are not joined once an error is seen.
        .collect()
}
16 changes: 7 additions & 9 deletions tests/it/code_vsi.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
//! Tests for plain code files using legacy VSI fingerprints.

use std::io::Cursor;

use pretty_assertions::assert_eq;

use fingerprint::*;
Expand All @@ -11,7 +9,7 @@ use fingerprint::*;
///
/// ```ignore
/// let content = b"hello world";
/// let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
/// let combined = Combined::from_buffer(&content).expect("fingerprint");
/// assert_fingerprint_eq!(Kind::RawSha256, content, combined);
/// assert_fingerprint_eq!(Kind::CommentStrippedSha256, content, combined);
/// ```
Expand Down Expand Up @@ -69,15 +67,15 @@ fn combined_getters() {
#[test]
fn fingerprints_binary_file() {
let content = vec![1, 2, 3, 0, 1, 2, 3];
let combined = Combined::from_stream(&mut Cursor::new(content.clone())).expect("fingerprint");
let combined = Combined::from_buffer(&content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, &content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

#[test]
fn fingerprints_text_file() {
let content = b"hello world";
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, content, combined);
}
Expand All @@ -88,7 +86,7 @@ fn fingerprints_text_file_stripping_cr() {
let content_cs = b"hello world\nanother line\na final line";
let without_cr = b"hello world\nanother line\na final line\n";

let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, without_cr, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, content_cs, combined);
}
Expand All @@ -97,15 +95,15 @@ fn fingerprints_text_file_stripping_cr() {
fn fingerprints_binary_file_appearing_as_text() {
// Sourced from `[email protected]:chromium/chromium.git` at `tools/origin_trials/eftest.key` on commit 49249345609d505c8bb8b0b5a42ff4b68b9e6d41.
let content = include_bytes!("../../testdata/eftest.key");
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

#[test]
fn comment_stripped_does_not_fingerprint_binary_file() {
let content = vec![1, 2, 3, 0, 1, 2, 3];
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

Expand All @@ -123,7 +121,7 @@ int main() {
}
"#;

let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
let expected = Content::new(
hex::decode("44fc8f68ab633c7ca0240a66e4ff038c0f2412fe69d14b6f052556edaa1b9160")
.expect("decode hex literal"),
Expand Down
Loading