Skip to content

Commit

Permalink
Maven Central fingerprint kind (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
jssblck authored Jun 17, 2024
1 parent 7b34840 commit 7377867
Show file tree
Hide file tree
Showing 9 changed files with 698 additions and 43 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@

# v2.1.0

- Adds new `Kind::JarMavenCentralV1` fingerprint.
- Attempts to improve performance of file-based fingerprinting by parallelizing across threads.
- This was done now that we have several kinds of fingerprints, and we'll probably just keep adding more.

# v2.0.0

Refactored to the new Sparkle-based view of fingerprints.
Expand Down
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fingerprint"
version = "2.0.0"
version = "2.1.0"
edition = "2021"

[features]
Expand Down Expand Up @@ -42,6 +42,7 @@ strum = { version = "0.26.2", features = ["derive"] }
alphanumeric-sort = "1.5.3"
tap = "1.0.1"
tracing = "0.1.40"
sha1 = "0.10.6"

[dev-dependencies]
pretty_assertions = "1.4.0"
Expand Down
10 changes: 10 additions & 0 deletions src/fingerprint/jar.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::io::{BufRead, Seek};

use sha1::Sha1;
use sha2::{Digest, Sha256};
use tap::Pipe;
use tracing::warn;
Expand All @@ -22,6 +23,15 @@ pub fn raw(stream: impl BufRead + Seek) -> Result<Option<Fingerprint>, Error> {
}
}

/// Produce the Maven Central flavored fingerprint for a java archive.
///
/// Everything readable from `stream` (starting at its current position) is fed,
/// unmodified, into a SHA-1 hasher; the resulting digest becomes the content of a
/// [`Kind::JarMavenCentralV1`] fingerprint.
///
/// # Errors
///
/// Any I/O failure raised while draining `stream` into the hasher is propagated.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn maven_central(mut stream: impl BufRead + Seek) -> Result<Option<Fingerprint>, Error> {
    let mut sha1 = Sha1::new();
    // The hasher is used as the write side of the copy, so the archive bytes are digested as they stream through.
    std::io::copy(&mut stream, &mut sha1)?;

    let fingerprint = Fingerprint::new(Kind::JarMavenCentralV1, Content::from_digest(sha1));
    Ok(Some(fingerprint))
}

/// Fingerprint class files inside a java archive (a JAR).
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn class(stream: impl BufRead + Seek) -> Result<Option<Fingerprint>, Error> {
Expand Down
59 changes: 53 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ use std::{
fs::File,
io::{BufRead, BufReader, Cursor, Seek},
path::Path,
thread::ScopedJoinHandle,
};

use getset::Getters;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use strum::{AsRefStr, Display, EnumIter, IntoEnumIterator, VariantNames};
use tap::Pipe;
use thiserror::Error;

mod fingerprint;
Expand Down Expand Up @@ -123,6 +125,15 @@ pub enum Kind {
#[strum(serialize = "v1.raw.jar")]
JarRawV1,

/// Represents a fingerprint derived by hashing the raw contents of a JAR file in the same manner
/// as Maven Central. The idea is that such fingerprints can then be looked up via the
/// Maven Central REST API as a fallback to our own indexing.
///
/// Specifically:
/// - The content of the JAR file is hashed as-is using the sha1 algorithm.
#[strum(serialize = "v1.mavencentral.jar")]
JarMavenCentralV1,

/// Represents a fingerprint derived by hashing the raw contents of a JAR file with the SHA256 algorithm
/// in a platform-independent manner.
///
Expand Down Expand Up @@ -162,7 +173,7 @@ impl<'de> Deserialize<'de> for Kind {
}

/// An array of bytes representing a fingerprint's content.
#[derive(Clone, Eq, PartialEq, Hash, Default)]
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default)]
pub struct Content(Vec<u8>);

impl Content {
Expand Down Expand Up @@ -323,6 +334,7 @@ impl Fingerprint {
Kind::CommentStrippedSha256 => fingerprint::text::comment_stripped(stream),
Kind::JarRawV1 => fingerprint::jar::raw(stream),
Kind::JarClassV1 => fingerprint::jar::class(stream),
Kind::JarMavenCentralV1 => fingerprint::jar::maven_central(stream),
}
}

Expand Down Expand Up @@ -390,7 +402,11 @@ impl<K: Into<Kind>, C: Into<Content>> From<(K, C)> for Fingerprint {
pub struct Combined(HashMap<Kind, Content>);

impl Combined {
/// Fingerprint the provided stream (typically a file handle) with all fingerprint [`Kind`]s.
/// Fingerprint the provided stream with all fingerprint [`Kind`]s.
///
/// Note: this forces fingerprinting to be performed serially
/// since the stream has to be seeked backwards for each fingerprinter;
/// if this is not desired consider [`Combined::from_file`] or [`Combined::from_buffer`].
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn from_stream(mut stream: impl BufRead + Seek) -> Result<Self, Error> {
let mut fingerprints = Vec::new();
Expand All @@ -407,10 +423,22 @@ impl Combined {
}

/// Fingerprint the provided file with all fingerprint [`Kind`]s.
///
/// Note: this opens the file multiple times, once for each kind of fingerprint,
/// then runs each fingerprinter in its own thread.
/// If this is not desired consider [`Combined::from_stream`] or [`Combined::from_buffer`].
#[tracing::instrument(level = tracing::Level::DEBUG, ret)]
pub fn from_file(path: &Path) -> Result<Self, Error> {
let mut file = BufReader::new(File::open(path)?);
Self::from_stream(&mut file)
std::thread::scope(|scope| {
let handles = Kind::iter()
.map(|kind| scope.spawn(move || Fingerprint::from_file(kind, path)))
.collect::<Vec<_>>();

match collapse_handles(handles) {
Ok(fps) => fps.into_iter().flatten().pipe(Combined::from).pipe(Ok),
Err(err) => Err(err),
}
})
}

/// Fingerprint the provided buffer with all fingerprint [`Kind`]s.
Expand All @@ -425,8 +453,13 @@ impl Combined {
/// of errors in the future it isn't a breaking change.
#[tracing::instrument(level = tracing::Level::DEBUG, fields(buf = %buf.as_ref().len()), ret)]
pub fn from_buffer(buf: impl AsRef<[u8]>) -> Result<Self, Error> {
let mut content = Cursor::new(buf);
Self::from_stream(&mut content)
Kind::iter()
.map(|kind| Fingerprint::from_buffer(kind, buf.as_ref()))
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.flatten()
.pipe(Combined::from)
.pipe(Ok)
}

/// Create a new instance from a single fingerprint.
Expand Down Expand Up @@ -462,3 +495,17 @@ impl<I: IntoIterator<Item = F>, F: Into<Fingerprint>> From<I> for Combined {
)
}
}

/// Join each scoped thread handle in order and gather the values they produced.
///
/// - If a joined thread panicked, the panic is re-raised on the calling thread
///   with its original payload.
/// - If a joined thread returned `Err`, that error is returned immediately;
///   handles later in the list are not joined by this function.
/// - Otherwise the `Ok` values are returned in the same order as `handles`.
fn collapse_handles<T, E>(handles: Vec<ScopedJoinHandle<'_, Result<T, E>>>) -> Result<Vec<T>, E> {
    handles
        .into_iter()
        .map(|handle| {
            handle
                .join()
                .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
        })
        // Collecting into `Result` stops at the first `Err`, matching an early return.
        .collect()
}
16 changes: 7 additions & 9 deletions tests/it/code_vsi.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
//! Tests for plain code files using legacy VSI fingerprints.
use std::io::Cursor;

use pretty_assertions::assert_eq;

use fingerprint::*;
Expand All @@ -11,7 +9,7 @@ use fingerprint::*;
///
/// ```ignore
/// let content = b"hello world";
/// let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
/// let combined = Combined::from_buffer(&content).expect("fingerprint");
/// assert_fingerprint_eq!(Kind::RawSha256, content, combined);
/// assert_fingerprint_eq!(Kind::CommentStrippedSha256, content, combined);
/// ```
Expand Down Expand Up @@ -69,15 +67,15 @@ fn combined_getters() {
#[test]
fn fingerprints_binary_file() {
let content = vec![1, 2, 3, 0, 1, 2, 3];
let combined = Combined::from_stream(&mut Cursor::new(content.clone())).expect("fingerprint");
let combined = Combined::from_buffer(&content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, &content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

#[test]
fn fingerprints_text_file() {
let content = b"hello world";
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, content, combined);
}
Expand All @@ -88,7 +86,7 @@ fn fingerprints_text_file_stripping_cr() {
let content_cs = b"hello world\nanother line\na final line";
let without_cr = b"hello world\nanother line\na final line\n";

let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, without_cr, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, content_cs, combined);
}
Expand All @@ -97,15 +95,15 @@ fn fingerprints_text_file_stripping_cr() {
fn fingerprints_binary_file_appearing_as_text() {
// Sourced from `git@github.com:chromium/chromium.git` at `tools/origin_trials/eftest.key` on commit 49249345609d505c8bb8b0b5a42ff4b68b9e6d41.
let content = include_bytes!("../../testdata/eftest.key");
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

#[test]
fn comment_stripped_does_not_fingerprint_binary_file() {
let content = vec![1, 2, 3, 0, 1, 2, 3];
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

Expand All @@ -123,7 +121,7 @@ int main() {
}
"#;

let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
let expected = Content::new(
hex::decode("44fc8f68ab633c7ca0240a66e4ff038c0f2412fe69d14b6f052556edaa1b9160")
.expect("decode hex literal"),
Expand Down
Loading

0 comments on commit 7377867

Please sign in to comment.