From f34e4abc39037fe28b32a9ac3049f9d7f8d5c017 Mon Sep 17 00:00:00 2001 From: konstin Date: Thu, 27 Jun 2024 19:41:55 +0200 Subject: [PATCH] Don't add duplicate files Add a file tracker for writing archives that checks that each path in the archive can't be added more than once. If the same file would be added twice, it's ignored the second time, if a different file would be added, it errors. Fixes #2066 Fixes #2106 --- Cargo.lock | 1 + Cargo.toml | 1 + Changelog.md | 2 + src/module_writer.rs | 163 ++++++++++++++---- src/source_distribution.rs | 11 +- .../pyo3-mixed/pyo3_mixed/assets/asset.txt | 3 + test-crates/pyo3-mixed/pyproject.toml | 3 + tests/common/integration.rs | 17 ++ tests/common/other.rs | 7 +- 9 files changed, 170 insertions(+), 38 deletions(-) create mode 100644 test-crates/pyo3-mixed/pyo3_mixed/assets/asset.txt diff --git a/Cargo.lock b/Cargo.lock index 1d1cb5a7c..f8c58dcf5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1391,6 +1391,7 @@ dependencies = [ "rustls", "rustls-pemfile", "rustversion", + "same-file", "schemars", "semver", "serde", diff --git a/Cargo.toml b/Cargo.toml index 4697a044e..e2407aa59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -82,6 +82,7 @@ normpath = "1.1.1" path-slash = "0.2.1" pep440_rs = { version = "0.5.0", features = ["serde", "tracing"] } pep508_rs = { version = "0.4.2", features = ["serde", "tracing"] } +same-file = "1.0.6" time = "0.3.17" url = "2.5.0" unicode-xid = { version = "0.2.4", optional = true } diff --git a/Changelog.md b/Changelog.md index 911178e20..5377ce850 100644 --- a/Changelog.md +++ b/Changelog.md @@ -2,6 +2,8 @@ ## [Unreleased] +* Don't add files to an archive more than once [#2125](https://github.com/PyO3/maturin/issues/2125) + ## [1.6.0] - 2024-06-04 * Detect compiling from Linux gnu to Linux musl as cross compiling in [#2010](https://github.com/PyO3/maturin/pull/2010) diff --git a/src/module_writer.rs b/src/module_writer.rs index b6d9fce4d..309d0890b 100644 --- a/src/module_writer.rs +++ b/src/module_writer.rs @@ -19,8 +19,9 @@ use ignore::overrides::Override; use ignore::WalkBuilder; use indexmap::IndexMap; use normpath::PathExt as _; +use same_file::is_same_file; use sha2::{Digest, Sha256}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::env; use std::ffi::OsStr; use std::fmt::Write as _; @@ -40,18 +41,28 @@ pub trait ModuleWriter { /// Adds a directory relative to the module base path fn add_directory(&mut self, path: impl AsRef) -> Result<()>; - /// Adds a file with bytes as content in target relative to the module base path - fn add_bytes(&mut self, target: impl AsRef, bytes: &[u8]) -> Result<()> { + /// Adds a file with bytes as content in target relative to the module base path. + /// + /// For generated files, `source` is `None`. + fn add_bytes( + &mut self, + target: impl AsRef, + source: Option<&Path>, + bytes: &[u8], + ) -> Result<()> { debug!("Adding {}", target.as_ref().display()); // 0o644 is the default from the zip crate - self.add_bytes_with_permissions(target, bytes, 0o644) + self.add_bytes_with_permissions(target, source, bytes, 0o644) } /// Adds a file with bytes as content in target relative to the module base path while setting /// the given unix permissions + /// + /// For generated files, `source` is `None`. fn add_bytes_with_permissions( &mut self, target: impl AsRef, + source: Option<&Path>, bytes: &[u8], permissions: u32, ) -> Result<()>; @@ -61,7 +72,7 @@ pub trait ModuleWriter { self.add_file_with_permissions(target, source, 0o644) } - /// Copies the source file the the target path relative to the module base path while setting + /// Copies the source file the target path relative to the module base path while setting /// the given unix permissions fn add_file_with_permissions( &mut self, @@ -77,7 +88,7 @@ pub trait ModuleWriter { let mut file = File::open(source).context(read_failed_context.clone())?; let mut buffer = Vec::new(); file.read_to_end(&mut buffer).context(read_failed_context)?; - self.add_bytes_with_permissions(target, &buffer, permissions) + self.add_bytes_with_permissions(target, Some(source), &buffer, permissions) .context(format!("Failed to write to {}", target.display()))?; Ok(()) } @@ -87,6 +98,7 @@ pub trait ModuleWriter { pub struct PathWriter { base_path: PathBuf, record: Vec<(String, String, usize)>, + file_tracker: FileTracker, } impl PathWriter { @@ -103,6 +115,7 @@ impl PathWriter { Ok(PathWriter { base_path, record: Vec::new(), + file_tracker: FileTracker::default(), }) } @@ -111,6 +124,7 @@ impl PathWriter { Self { base_path: path.as_ref().to_path_buf(), record: Vec::new(), + file_tracker: FileTracker::default(), } } @@ -169,11 +183,17 @@ impl ModuleWriter for PathWriter { fn add_bytes_with_permissions( &mut self, target: impl AsRef, + source: Option<&Path>, bytes: &[u8], - _permissions: u32, + permissions: u32, ) -> Result<()> { let path = self.base_path.join(&target); + if !self.file_tracker.add_file(target.as_ref(), source)? { + // Ignore duplicate files. + return Ok(()); + } + // We only need to set the executable bit on unix let mut file = { #[cfg(target_family = "unix")] @@ -182,7 +202,7 @@ impl ModuleWriter for PathWriter { .create(true) .write(true) .truncate(true) - .mode(_permissions) + .mode(permissions) .open(&path) } #[cfg(target_os = "windows")] @@ -212,6 +232,7 @@ pub struct WheelWriter { record: Vec<(String, String, usize)>, record_file: PathBuf, wheel_path: PathBuf, + file_tracker: FileTracker, excludes: Override, } @@ -223,6 +244,7 @@ impl ModuleWriter for WheelWriter { fn add_bytes_with_permissions( &mut self, target: impl AsRef, + source: Option<&Path>, bytes: &[u8], permissions: u32, ) -> Result<()> { @@ -230,6 +252,12 @@ impl ModuleWriter for WheelWriter { if self.exclude(target) { return Ok(()); } + + if !self.file_tracker.add_file(target, source)? { + // Ignore duplicate files. + return Ok(()); + } + // The zip standard mandates using unix style paths let target = target.to_str().unwrap().replace('\\', "/"); @@ -284,6 +312,7 @@ impl WheelWriter { record: Vec::new(), record_file: metadata23.get_dist_info_dir().join("RECORD"), wheel_path, + file_tracker: FileTracker::default(), excludes, }; @@ -313,7 +342,7 @@ impl WheelWriter { let name = metadata23.get_distribution_escaped(); let target = format!("{name}.pth"); debug!("Adding {} from {}", target, python_path); - self.add_bytes(target, python_path.as_bytes())?; + self.add_bytes(target, None, python_path.as_bytes())?; } else { eprintln!("⚠️ source code path contains non-Unicode sequences, editable installs may not work."); } @@ -375,7 +404,7 @@ impl WheelWriter { pub struct SDistWriter { tar: tar::Builder>>, path: PathBuf, - files: HashSet, + file_tracker: FileTracker, excludes: Override, } @@ -387,6 +416,7 @@ impl ModuleWriter for SDistWriter { fn add_bytes_with_permissions( &mut self, target: impl AsRef, + source: Option<&Path>, bytes: &[u8], permissions: u32, ) -> Result<()> { @@ -395,8 +425,8 @@ impl ModuleWriter for SDistWriter { return Ok(()); } - if self.files.contains(target) { - // Ignore duplicate files + if !self.file_tracker.add_file(target, source)? { + // Ignore duplicate files. return Ok(()); } @@ -411,7 +441,6 @@ impl ModuleWriter for SDistWriter { bytes.len(), target.display() ))?; - self.files.insert(target.to_path_buf()); Ok(()) } @@ -421,12 +450,12 @@ impl ModuleWriter for SDistWriter { return Ok(()); } let target = target.as_ref(); - if self.files.contains(target) { - // Ignore duplicate files + if !self.file_tracker.add_file(target, Some(source))? { + // Ignore duplicate files. return Ok(()); } - debug!("Adding {} from {}", target.display(), source.display()); + debug!("Adding {} from {}", target.display(), source.display()); self.tar .append_path_with_name(source, target) .context(format!( @@ -434,7 +463,6 @@ impl ModuleWriter for SDistWriter { source.display(), target.display(), ))?; - self.files.insert(target.to_path_buf()); Ok(()) } } @@ -462,7 +490,7 @@ impl SDistWriter { Ok(Self { tar, path, - files: HashSet::new(), + file_tracker: FileTracker::default(), excludes, }) } @@ -480,6 +508,63 @@ impl SDistWriter { } } +/// Keep track of which files we added from where, so we can skip duplicate files and error when +/// adding conflicting files. +/// +/// The wrapped type contains as key the path added to the archive and as value the originating path +/// on the file system or `None` for generated files. +#[derive(Default)] +struct FileTracker(HashMap>); + +impl FileTracker { + /// Returns `true` if the file should be added, `false` if an identical file was already added + /// (skip) and an error if a different file was already added. + fn add_file(&mut self, target: &Path, source: Option<&Path>) -> Result { + let Some(previous_source) = self + .0 + .insert(target.to_path_buf(), source.map(|path| path.to_path_buf())) + else { + // The path doesn't exist in the archive yet. + return Ok(true); + }; + match (previous_source, source) { + (None, None) => { + bail!( + "Generated file {} was already added, can't add it again", + target.display() + ); + } + (Some(previous_source), None) => { + bail!( + "File {} was already added from {}, can't overwrite with generated file", + target.display(), + previous_source.display() + ) + } + (None, Some(source)) => { + bail!( + "Generated file {} was already added, can't overwrite it with {}", + target.display(), + source.display() + ); + } + (Some(previous_source), Some(source)) => { + if is_same_file(source, &previous_source).unwrap_or(false) { + // Ignore identical duplicate files + Ok(false) + } else { + bail!( + "File {} was already added from {}, can't added it from {}", + target.display(), + previous_source.display(), + source.display() + ); + } + } + } + } +} + fn wheel_file(tags: &[String]) -> Result { let mut wheel_file = format!( "Wheel-Version: 1.0 @@ -769,6 +854,7 @@ pub fn write_bindings_module( // Reexport the shared library as if it were the top level module writer.add_bytes( &module.join("__init__.py"), + None, format!( r#"from .{ext_name} import * @@ -782,7 +868,7 @@ if hasattr({ext_name}, "__all__"): if type_stub.exists() { eprintln!("📖 Found type stub file at {ext_name}.pyi"); writer.add_file(&module.join("__init__.pyi"), type_stub)?; - writer.add_bytes(&module.join("py.typed"), b"")?; + writer.add_bytes(&module.join("py.typed"), None, b"")?; } writer.add_file_with_permissions(&module.join(so_filename), artifact, 0o755)?; } @@ -846,16 +932,17 @@ pub fn write_cffi_module( if type_stub.exists() { eprintln!("📖 Found type stub file at {module_name}.pyi"); writer.add_file(&module.join("__init__.pyi"), type_stub)?; - writer.add_bytes(&module.join("py.typed"), b"")?; + writer.add_bytes(&module.join("py.typed"), None, b"")?; } }; if !editable || project_layout.python_module.is_none() { writer.add_bytes( &module.join("__init__.py"), + None, cffi_init_file(&project_layout.extension_name).as_bytes(), )?; - writer.add_bytes(&module.join("ffi.py"), cffi_declarations.as_bytes())?; + writer.add_bytes(&module.join("ffi.py"), None, cffi_declarations.as_bytes())?; writer.add_file_with_permissions( &module.join(format!( "{extension_name}.so", @@ -1088,12 +1175,12 @@ pub fn write_uniffi_module( if type_stub.exists() { eprintln!("📖 Found type stub file at {module_name}.pyi"); writer.add_file(&module.join("__init__.pyi"), type_stub)?; - writer.add_bytes(&module.join("py.typed"), b"")?; + writer.add_bytes(&module.join("py.typed"), None, b"")?; } }; if !editable || project_layout.python_module.is_none() { - writer.add_bytes(&module.join("__init__.py"), py_init.as_bytes())?; + writer.add_bytes(&module.join("__init__.py"), None, py_init.as_bytes())?; writer.add_file( module.join(binding_name).with_extension("py"), uniffi_binding, @@ -1175,7 +1262,7 @@ if __name__ == '__main__': let launcher_path = Path::new(&metadata.get_distribution_escaped()) .join(bin_name.replace('-', "_")) .with_extension("py"); - writer.add_bytes_with_permissions(&launcher_path, entrypoint_script.as_bytes(), 0o755)?; + writer.add_bytes_with_permissions(&launcher_path, None, entrypoint_script.as_bytes(), 0o755)?; Ok(()) } @@ -1267,10 +1354,15 @@ pub fn write_dist_info( writer.add_bytes( &dist_info_dir.join("METADATA"), + None, metadata23.to_file_contents()?.as_bytes(), )?; - writer.add_bytes(&dist_info_dir.join("WHEEL"), wheel_file(tags)?.as_bytes())?; + writer.add_bytes( + &dist_info_dir.join("WHEEL"), + None, + wheel_file(tags)?.as_bytes(), + )?; let mut entry_points = String::new(); if !metadata23.scripts.is_empty() { @@ -1285,6 +1377,7 @@ pub fn write_dist_info( if !entry_points.is_empty() { writer.add_bytes( &dist_info_dir.join("entry_points.txt"), + None, entry_points.as_bytes(), )?; } @@ -1379,9 +1472,9 @@ mod tests { // No excludes let tmp_dir = TempDir::new()?; let mut writer = SDistWriter::new(&tmp_dir, &metadata, Override::empty())?; - assert!(writer.files.is_empty()); - writer.add_bytes_with_permissions("test", &[], perm)?; - assert_eq!(writer.files.len(), 1); + assert!(writer.file_tracker.0.is_empty()); + writer.add_bytes_with_permissions("test", Some(Path::new("test")), &[], perm)?; + assert_eq!(writer.file_tracker.0.len(), 1); writer.finish()?; tmp_dir.close()?; @@ -1391,13 +1484,13 @@ mod tests { excludes.add("test*")?; excludes.add("!test2")?; let mut writer = SDistWriter::new(&tmp_dir, &metadata, excludes.build()?)?; - writer.add_bytes_with_permissions("test1", &[], perm)?; - writer.add_bytes_with_permissions("test3", &[], perm)?; - assert!(writer.files.is_empty()); - writer.add_bytes_with_permissions("test2", &[], perm)?; - assert!(!writer.files.is_empty()); - writer.add_bytes_with_permissions("yes", &[], perm)?; - assert_eq!(writer.files.len(), 2); + writer.add_bytes_with_permissions("test1", Some(Path::new("test1")), &[], perm)?; + writer.add_bytes_with_permissions("test3", Some(Path::new("test3")), &[], perm)?; + assert!(writer.file_tracker.0.is_empty()); + writer.add_bytes_with_permissions("test2", Some(Path::new("test2")), &[], perm)?; + assert!(!writer.file_tracker.0.is_empty()); + writer.add_bytes_with_permissions("yes", Some(Path::new("yes")), &[], perm)?; + assert_eq!(writer.file_tracker.0.len(), 2); writer.finish()?; tmp_dir.close()?; diff --git a/src/source_distribution.rs b/src/source_distribution.rs index d38d4fe9d..8502e00ca 100644 --- a/src/source_distribution.rs +++ b/src/source_distribution.rs @@ -218,7 +218,11 @@ fn add_crate_to_source_distribution( if root_crate { let rewritten_cargo_toml = rewrite_cargo_toml(manifest_path, known_path_deps)?; - writer.add_bytes(cargo_toml_path, rewritten_cargo_toml.as_bytes())?; + writer.add_bytes( + cargo_toml_path, + Some(manifest_path), + rewritten_cargo_toml.as_bytes(), + )?; } else if !skip_cargo_toml { writer.add_file(cargo_toml_path, manifest_path)?; } @@ -520,9 +524,10 @@ fn add_cargo_package_files_to_sdist( readme: None, }, ); - let workspace_cargo_toml = rewrite_cargo_toml(workspace_manifest_path, &deps_to_keep)?; + let workspace_cargo_toml = rewrite_cargo_toml(&workspace_manifest_path, &deps_to_keep)?; writer.add_bytes( root_dir.join(relative_workspace_cargo_toml), + Some(workspace_manifest_path.as_std_path()), workspace_cargo_toml.as_bytes(), )?; } @@ -545,6 +550,7 @@ fn add_cargo_package_files_to_sdist( )?; writer.add_bytes( root_dir.join("pyproject.toml"), + Some(pyproject_toml_path), rewritten_pyproject_toml.as_bytes(), )?; } else { @@ -684,6 +690,7 @@ pub fn source_distribution( writer.add_bytes( root_dir.join("PKG-INFO"), + None, metadata23.to_file_contents()?.as_bytes(), )?; diff --git a/test-crates/pyo3-mixed/pyo3_mixed/assets/asset.txt b/test-crates/pyo3-mixed/pyo3_mixed/assets/asset.txt new file mode 100644 index 000000000..273a76140 --- /dev/null +++ b/test-crates/pyo3-mixed/pyo3_mixed/assets/asset.txt @@ -0,0 +1,3 @@ +This file is included twice, once implicitly by in the python root and once explicitly in pyproject.toml +and checks that it gets deduplicated, see https://github.com/PyO3/maturin/issues/2106 and +https://github.com/PyO3/maturin/issues/2066. diff --git a/test-crates/pyo3-mixed/pyproject.toml b/test-crates/pyo3-mixed/pyproject.toml index e50b8b313..52a69bbfe 100644 --- a/test-crates/pyo3-mixed/pyproject.toml +++ b/test-crates/pyo3-mixed/pyproject.toml @@ -14,3 +14,6 @@ dependencies = ["boltons"] [project.scripts] get_42 = "pyo3_mixed:get_42" print_cli_args = "pyo3_mixed:print_cli_args" + +[tool.maturin] +include = ["pyo3_mixed/assets/*"] diff --git a/tests/common/integration.rs b/tests/common/integration.rs index 363f6d141..f2c89e18a 100644 --- a/tests/common/integration.rs +++ b/tests/common/integration.rs @@ -3,7 +3,9 @@ use anyhow::{bail, Context, Result}; #[cfg(feature = "zig")] use cargo_zigbuild::Zig; use clap::Parser; +use fs_err::File; use maturin::{BuildOptions, PlatformTag, PythonInterpreter}; +use std::collections::HashSet; use std::env; use std::path::Path; use std::process::Command; @@ -93,6 +95,7 @@ pub fn test_integration( // We can do this since we know that wheels are built and returned in the // order they are in the build context for ((filename, supported_version), python_interpreter) in wheels.iter().zip(interpreter) { + check_for_duplicates(filename)?; if test_zig && build_context.target.is_linux() && !build_context.target.is_musl_libc() @@ -231,3 +234,17 @@ pub fn test_integration_conda(package: impl AsRef, bindings: Option and +/// . +fn check_for_duplicates(wheel: &Path) -> Result<()> { + let mut seen = HashSet::new(); + let mut reader = File::open(wheel)?; + // We have to use this API since `ZipArchive` deduplicates names. + while let Some(file) = zip::read::read_zipfile_from_stream(&mut reader)? { + if !seen.insert(file.name().to_string()) { + bail!("Duplicate file: {}", file.name()); + } + } + Ok(()) +} diff --git a/tests/common/other.rs b/tests/common/other.rs index 91eebc980..2ab2136a6 100644 --- a/tests/common/other.rs +++ b/tests/common/other.rs @@ -176,7 +176,12 @@ pub fn test_source_distribution( } } expected_files.assert_debug_eq(&files); - assert_eq!(file_count, files.len(), "duplicated files found in sdist"); + assert_eq!( + file_count, + files.len(), + "duplicated files found in sdist: {:?}", + files + ); if let Some((cargo_toml_path, expected)) = expected_cargo_toml { let cargo_toml = cargo_toml