Skip to content

Commit

Permalink
perf: validate using threads
Browse files Browse the repository at this point in the history
  • Loading branch information
baszalmstra committed Nov 15, 2024
1 parent 19cd7d5 commit c856265
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 50 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ plist = "1"
purl = { version = "0.1.3", features = ["serde"] }
quote = "1.0.37"
rand = "0.8.5"
rayon = "1.10.0"
reflink-copy = "0.1.19"
regex = "1.11.1"
reqwest = { version = "0.12.9", default-features = false }
Expand Down
1 change: 1 addition & 0 deletions crates/rattler_cache/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ reqwest-middleware.workspace = true
digest.workspace = true
fs4 = { workspace = true, features = ["fs-err-tokio"] }
simple_spawn_blocking = { version = "1.0.0", path = "../simple_spawn_blocking", features = ["tokio"] }
rayon = { workspace = true }

[dev-dependencies]
assert_matches.workspace = true
Expand Down
111 changes: 61 additions & 50 deletions crates/rattler_cache/src/validation.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@
//! Functionality to validate the contents of a Conda package.
//!
//! Almost all Conda packages contain a file `info/paths.json` that describes all the files the
//! package contains. The [`validate_package_directory`] function validates that a directory
//! containing an extracted Conda package archive actually contains the files as described by the
//! `paths.json` file.
//! Almost all Conda packages contain a file `info/paths.json` that describes
//! all the files the package contains. The [`validate_package_directory`]
//! function validates that a directory containing an extracted Conda package
//! archive actually contains the files as described by the `paths.json` file.
//!
//! Very old Conda packages do not contain a `paths.json` file. These packages contain a
//! (deprecated) `files` file as well as optionally a `has_prefix` and some other files. If the
//! `paths.json` file is missing these deprecated files are used instead to reconstruct a
//! [`PathsJson`] object. See [`PathsJson::from_deprecated_package_directory`] for more information.
//! Very old Conda packages do not contain a `paths.json` file. These packages
//! contain a (deprecated) `files` file as well as optionally a `has_prefix` and
//! some other files. If the `paths.json` file is missing these deprecated files
//! are used instead to reconstruct a [`PathsJson`] object. See
//! [`PathsJson::from_deprecated_package_directory`] for more information.

use digest::Digest;
use rattler_conda_types::package::{IndexJson, PackageFile, PathType, PathsEntry, PathsJson};
use rattler_digest::Sha256;
use std::{
io::ErrorKind,
io::{BufReader, ErrorKind},
path::{Path, PathBuf},
};

/// An error that is returned by [`validate_package_directory`] if the contents of the directory seem to be
/// corrupted.
use digest::Digest;
use rattler_conda_types::package::{IndexJson, PackageFile, PathType, PathsEntry, PathsJson};
use rattler_digest::Sha256;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};

/// An error that is returned by [`validate_package_directory`] if the contents
/// of the directory seem to be corrupted.
#[derive(Debug, thiserror::Error)]
pub enum PackageValidationError {
/// Neither a `paths.json` file nor a deprecated `files` file was found.
Expand All @@ -43,7 +46,8 @@ pub enum PackageValidationError {
ReadIndexJsonError(#[source] std::io::Error),
}

/// An error that indicates that a specific file in a package archive directory seems to be corrupted.
/// An error that indicates that a specific file in a package archive directory
/// seems to be corrupted.
#[derive(Debug, thiserror::Error)]
pub enum PackageEntryValidationError {
/// An error occurred while reading the metadata of the file.
Expand Down Expand Up @@ -75,24 +79,25 @@ pub enum PackageEntryValidationError {
HashMismatch(String, String),
}

/// Determine whether the files in the specified directory match what is expected according to the
/// `info/paths.json` file in the same directory.
/// Determine whether the files in the specified directory match what is
/// expected according to the `info/paths.json` file in the same directory.
///
/// If the `info/paths.json` file could not be found this function tries to reconstruct the
/// information from older deprecated methods. See [`PathsJson::from_deprecated_package_directory`].
/// If the `info/paths.json` file could not be found this function tries to
/// reconstruct the information from older deprecated methods. See
/// [`PathsJson::from_deprecated_package_directory`].
///
/// If validation succeeds the parsed [`PathsJson`] object is returned which contains information
/// about the files in the archive.
/// If validation succeeds the parsed [`PathsJson`] object is returned which
/// contains information about the files in the archive.
pub fn validate_package_directory(
package_dir: &Path,
) -> Result<(IndexJson, PathsJson), PackageValidationError> {
// Validate that there is a valid IndexJson
let index_json = IndexJson::from_package_directory(package_dir)
.map_err(PackageValidationError::ReadIndexJsonError)?;

// Read the 'paths.json' file which describes all files that should be present. If the file
// could not be found try reconstructing the paths information from deprecated files in the
// package directory.
// Read the 'paths.json' file which describes all files that should be present.
// If the file could not be found try reconstructing the paths information
// from deprecated files in the package directory.
let paths = match PathsJson::from_package_directory(package_dir) {
Err(e) if e.kind() == ErrorKind::NotFound => {
match PathsJson::from_deprecated_package_directory(package_dir) {
Expand All @@ -114,21 +119,20 @@ pub fn validate_package_directory(
Ok((index_json, paths))
}

/// Determine whether the files in the specified directory match what is expected according to the
/// passed-in [`PathsJson`].
/// Determine whether the files in the specified directory match what is expected
/// according to the passed-in [`PathsJson`].
pub fn validate_package_directory_from_paths(
package_dir: &Path,
paths: &PathsJson,
) -> Result<(), (PathBuf, PackageEntryValidationError)> {
// Check every entry in the PathsJson object
for entry in paths.paths.iter() {
validate_package_entry(package_dir, entry).map_err(|e| (entry.relative_path.clone(), e))?;
}

Ok(())
paths.paths.par_iter().try_for_each(|entry| {
validate_package_entry(package_dir, entry).map_err(|e| (entry.relative_path.clone(), e))
})
}

/// Determine whether the information in the [`PathsEntry`] matches the file in the package directory.
/// Determine whether the information in the [`PathsEntry`] matches the file in
/// the package directory.
fn validate_package_entry(
package_dir: &Path,
entry: &PathsEntry,
Expand All @@ -143,7 +147,8 @@ fn validate_package_entry(
}
}

/// Determine whether the information in the [`PathsEntry`] matches the file at the specified path.
/// Determine whether the information in the [`PathsEntry`] matches the file at
/// the specified path.
fn validate_package_hard_link_entry(
path: PathBuf,
entry: &PathsEntry,
Expand Down Expand Up @@ -184,6 +189,7 @@ fn validate_package_hard_link_entry(
// Check the SHA256 hash of the file
if let Some(expected_hash) = &entry.sha256 {
// Determine the hash of the file on disk
let mut file = BufReader::with_capacity(64 * 1024, file);
let mut hasher = Sha256::default();
std::io::copy(&mut file, &mut hasher)?;
let hash = hasher.finalize();
Expand All @@ -200,8 +206,8 @@ fn validate_package_hard_link_entry(
Ok(())
}

/// Determine whether the information in the [`PathsEntry`] matches the symbolic link at the specified
/// path.
/// Determine whether the information in the [`PathsEntry`] matches the symbolic
/// link at the specified path.
fn validate_package_soft_link_entry(
path: PathBuf,
entry: &PathsEntry,
Expand All @@ -212,15 +218,17 @@ fn validate_package_soft_link_entry(
return Err(PackageEntryValidationError::ExpectedSymlink);
}

// TODO: Validate symlink content. Don't validate the SHA256 hash of the file: since a
// symlink will most likely point to another file added as a hardlink by the package, this is
// double work. Instead check that the symlink target is correct, e.g. `../a` points to the same
// file as `b/../../a` even though the two paths are written differently.
// TODO: Validate symlink content. Don't validate the SHA256 hash of the file:
// since a symlink will most likely point to another file added as a
// hardlink by the package, this is double work. Instead check that the
// symlink target is correct, e.g. `../a` points to the same file as
// `b/../../a` even though the two paths are written differently.

Ok(())
}

/// Determine whether the information in the [`PathsEntry`] matches the directory at the specified path.
/// Determine whether the information in the [`PathsEntry`] matches the
/// directory at the specified path.
fn validate_package_directory_entry(
path: PathBuf,
entry: &PathsEntry,
Expand All @@ -236,16 +244,18 @@ fn validate_package_directory_entry(

#[cfg(test)]
mod test {
use super::{
validate_package_directory, validate_package_directory_from_paths,
PackageEntryValidationError, PackageValidationError,
};
use std::io::Write;

use assert_matches::assert_matches;
use rattler_conda_types::package::{PackageFile, PathType, PathsJson};
use rstest::rstest;
use std::io::Write;
use url::Url;

use super::{
validate_package_directory, validate_package_directory_from_paths,
PackageEntryValidationError, PackageValidationError,
};

#[rstest]
#[case::conda(
"https://conda.anaconda.org/conda-forge/win-64/conda-22.9.0-py38haa244fe_2.tar.bz2",
Expand All @@ -270,8 +280,8 @@ mod test {

rattler_package_streaming::fs::extract(&package_path, temp_dir.path()).unwrap();

// Validate that the extracted package is correct. Since it's just been extracted this should
// work.
// Validate that the extracted package is correct. Since it's just been
// extracted this should work.
let result = validate_package_directory(temp_dir.path());
if let Err(e) = result {
panic!("{e}");
Expand All @@ -295,7 +305,8 @@ mod test {
file.write_all(&[255]).unwrap();
drop(file);

// Revalidate the package. Given that we changed a file, it should now fail with mismatched hashes.
// Revalidate the package. Given that we changed a file, it should now fail with
// mismatched hashes.
assert_matches!(
validate_package_directory_from_paths(temp_dir.path(), &paths),
Err((
Expand Down Expand Up @@ -323,8 +334,8 @@ mod test {

rattler_package_streaming::fs::extract(&package_path, temp_dir.path()).unwrap();

// Validate that the extracted package is correct. Since it's just been extracted this should
// work.
// Validate that the extracted package is correct. Since it's just been
// extracted this should work.
let result = validate_package_directory(temp_dir.path());
if let Err(e) = result {
panic!("{e}");
Expand Down

0 comments on commit c856265

Please sign in to comment.