Skip to content

Commit

Permalink
Support hard links for similar images and videos with -L (#1201)
Browse files Browse the repository at this point in the history
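With -L, files that share an inode (hard links to the same data) are collapsed to a single entry, so they are not reported as matches of each other.

Inode detection is only implemented on Unix; on Windows no inode is reported and every file is kept.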
  • Loading branch information
blob79 authored Feb 14, 2024
1 parent 0cc115c commit b63c631
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 12 deletions.
4 changes: 4 additions & 0 deletions czkawka_cli/src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ pub struct SimilarImagesArgs {
#[clap(flatten)]
pub delete_method: DMethod,
#[clap(flatten)]
pub allow_hard_links: AllowHardLinks,
#[clap(flatten)]
pub dry_run: DryRun,
#[clap(
short = 'g',
Expand Down Expand Up @@ -355,6 +357,8 @@ pub struct SimilarVideosArgs {
#[clap(flatten)]
pub delete_method: DMethod,
#[clap(flatten)]
pub allow_hard_links: AllowHardLinks,
#[clap(flatten)]
pub dry_run: DryRun,
#[clap(
short,
Expand Down
4 changes: 4 additions & 0 deletions czkawka_cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ fn similar_images(similar_images: SimilarImagesArgs, stop_receiver: &Receiver<()
hash_size,
delete_method,
dry_run,
allow_hard_links,
} = similar_images;

let mut item = SimilarImages::new();
Expand All @@ -198,6 +199,7 @@ fn similar_images(similar_images: SimilarImagesArgs, stop_receiver: &Receiver<()
item.set_delete_method(delete_method.delete_method);
item.set_dry_run(dry_run.dry_run);
item.set_similarity(return_similarity_from_similarity_preset(&similarity_preset, hash_size));
item.set_ignore_hard_links(!allow_hard_links.allow_hard_links);

item.find_similar_images(Some(stop_receiver), Some(progress_sender));

Expand Down Expand Up @@ -272,6 +274,7 @@ fn similar_videos(similar_videos: SimilarVideosArgs, stop_receiver: &Receiver<()
maximal_file_size,
delete_method,
dry_run,
allow_hard_links,
} = similar_videos;

let mut item = SimilarVideos::new();
Expand All @@ -282,6 +285,7 @@ fn similar_videos(similar_videos: SimilarVideosArgs, stop_receiver: &Receiver<()
item.set_tolerance(tolerance);
item.set_delete_method(delete_method.delete_method);
item.set_dry_run(dry_run.dry_run);
item.set_ignore_hard_links(!allow_hard_links.allow_hard_links);

item.find_similar_videos(Some(stop_receiver), Some(progress_sender));

Expand Down
210 changes: 206 additions & 4 deletions czkawka_core/src/common_dir_traversal.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::collections::BTreeMap;
use std::fs;
use std::fs::{DirEntry, FileType, Metadata, ReadDir};
use std::fs::{DirEntry, FileType, Metadata};
#[cfg(target_family = "unix")]
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::sync::atomic::Ordering;
use std::time::UNIX_EPOCH;
Expand Down Expand Up @@ -92,7 +94,7 @@ pub enum Collect {
Files,
}

#[derive(Eq, PartialEq, Copy, Clone)]
#[derive(Eq, PartialEq, Copy, Clone, Debug)]
enum EntryType {
File,
Dir,
Expand Down Expand Up @@ -546,9 +548,17 @@ fn process_symlink_in_symlink_mode(
fe_result.push(fe);
}

pub fn common_read_dir(current_folder: &Path, warnings: &mut Vec<String>) -> Option<ReadDir> {
pub fn common_read_dir(current_folder: &Path, warnings: &mut Vec<String>) -> Option<Vec<Result<DirEntry, std::io::Error>>> {
match fs::read_dir(current_folder) {
Ok(t) => Some(t),
Ok(t) => {
// Make directory traversal order stable
let mut r: Vec<_> = t.collect();
r.sort_by_key(|d| match d {
Ok(f) => f.path(),
_ => PathBuf::new(),
});
Some(r)
}
Err(e) => {
warnings.push(flc!(
"core_cannot_open_dir",
Expand Down Expand Up @@ -634,3 +644,195 @@ pub fn get_modified_time(metadata: &Metadata, warnings: &mut Vec<String>, curren
}
}
}

/// Fallback `inode` for targets without Unix metadata: no inode is ever reported.
///
/// Always returns `None`, so every entry ends up under the same `None` key when this
/// function is used as a grouping key, and `take_1_per_inode` leaves such groups untouched.
///
/// Gated on "not Unix" rather than "Windows" so that `inode` exists on every target;
/// with a Windows-only gate, targets that are neither Unix nor Windows would have no
/// definition at all and callers would fail to compile.
#[cfg(not(target_family = "unix"))]
pub fn inode(_fe: &FileEntry) -> Option<u64> {
    None
}

/// Looks up the inode number of the file located at `fe.path`.
///
/// Returns `Some(inode)` when the metadata query succeeds and `None` when it fails
/// (for example because the path no longer exists); the error itself is discarded.
///
/// Only the inode number is returned; the device id is not consulted here.
#[cfg(target_family = "unix")]
pub fn inode(fe: &FileEntry) -> Option<u64> {
    fs::metadata(&fe.path).ok().map(|metadata| metadata.ino())
}

/// Reduces one inode group to a single representative entry.
///
/// Takes a `(key, entries)` pair as produced by grouping file entries with `inode`:
/// - `Some(_)` key: every entry refers to the same inode, so only the first entry is kept.
/// - `None` key: no inode was available, so the entries are returned unchanged.
///
/// An empty `entries` vector is returned as-is for either key.
pub fn take_1_per_inode((k, mut v): (Option<u64>, Vec<FileEntry>)) -> Vec<FileEntry> {
    if k.is_some() {
        // `truncate` is a no-op for vectors of length 0 or 1, whereas `drain(1..)`
        // panics when the vector is empty.
        v.truncate(1);
    }
    v
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::common_tool::*;
    use once_cell::sync::Lazy;
    use std::fs;
    use std::fs::File;
    use std::io;
    use std::io::prelude::*;
    use std::time::{Duration, SystemTime};
    use tempfile::TempDir;

    // Presumably needed so a bare `CommonToolData` can be passed to
    // `DirTraversalBuilder::common_data` in the tests below.
    impl CommonData for CommonToolData {
        fn get_cd(&self) -> &CommonToolData {
            self
        }
        fn get_cd_mut(&mut self) -> &mut CommonToolData {
            self
        }
    }

    /// Fixed modification time stamped on every fixture file, so the expected
    /// `modified_date` does not depend on when the test runs.
    static NOW: Lazy<SystemTime> = Lazy::new(|| SystemTime::UNIX_EPOCH + Duration::new(100, 0));
    /// One-byte payload written to every fixture file (expected `size` is therefore 1).
    const CONTENT: &[u8; 1] = b"a";

    /// Creates three fixture files inside `dir` and returns their paths as `(src, hard, other)`:
    /// `a` is a regular file, `b` is a hard link to `a`, and `c` is an independent file
    /// with the same content.
    fn create_files(dir: &TempDir) -> io::Result<(PathBuf, PathBuf, PathBuf)> {
        let (src, hard, other) = (dir.path().join("a"), dir.path().join("b"), dir.path().join("c"));

        let mut file = File::create(&src)?;
        file.write_all(CONTENT)?;
        fs::hard_link(&src, &hard)?;
        file.set_modified(*NOW)?;

        let mut file = File::create(&other)?;
        file.write_all(CONTENT)?;
        file.set_modified(*NOW)?;
        Ok((src, hard, other))
    }

    /// Without inode grouping, all three files are reported, in path order.
    #[test]
    fn test_traversal() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, hard, other) = create_files(&dir)?;
        let secs = NOW.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs();

        let mut common_data = CommonToolData::new(ToolType::SimilarImages);
        common_data.directories.set_included_directory([dir.path().to_owned()].to_vec());
        common_data.set_minimal_file_size(0);

        match DirTraversalBuilder::new().group_by(|_fe| ()).common_data(&common_data).build().run() {
            DirTraversalResult::SuccessFiles {
                warnings: _,
                grouped_file_entries,
            } => {
                let actual: Vec<_> = grouped_file_entries.into_values().flatten().collect();
                assert_eq!(
                    [
                        FileEntry {
                            path: src,
                            size: 1,
                            modified_date: secs,
                        },
                        FileEntry {
                            path: hard,
                            size: 1,
                            modified_date: secs,
                        },
                        FileEntry {
                            path: other,
                            size: 1,
                            modified_date: secs,
                        },
                    ]
                    .to_vec(),
                    actual
                );
            }
            _ => {
                panic!("Expect SuccessFiles.");
            }
        };
        Ok(())
    }

    /// On Unix, grouping by inode and keeping one entry per group drops the hard link `b`.
    #[cfg(target_family = "unix")]
    #[test]
    fn test_traversal_group_by_inode() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, _, other) = create_files(&dir)?;
        let secs = NOW.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs();

        let mut common_data = CommonToolData::new(ToolType::SimilarImages);
        common_data.directories.set_included_directory([dir.path().to_owned()].to_vec());
        common_data.set_minimal_file_size(0);

        match DirTraversalBuilder::new().group_by(inode).common_data(&common_data).build().run() {
            DirTraversalResult::SuccessFiles {
                warnings: _,
                grouped_file_entries,
            } => {
                let mut actual: Vec<_> = grouped_file_entries.into_iter().flat_map(take_1_per_inode).collect();
                // The groups are keyed by inode number, so their relative order follows
                // whatever numbers the filesystem handed out. Sort by path so the
                // assertion does not depend on inode allocation.
                actual.sort_by(|left, right| left.path.cmp(&right.path));
                assert_eq!(
                    [
                        FileEntry {
                            path: src,
                            size: 1,
                            modified_date: secs,
                        },
                        FileEntry {
                            path: other,
                            size: 1,
                            modified_date: secs,
                        },
                    ]
                    .to_vec(),
                    actual
                );
            }
            _ => {
                panic!("Expect SuccessFiles.");
            }
        };
        Ok(())
    }

    /// On Windows `inode` always yields `None`, so nothing is collapsed and all three
    /// files are reported.
    #[cfg(target_family = "windows")]
    #[test]
    fn test_traversal_group_by_inode() -> io::Result<()> {
        let dir = tempfile::Builder::new().tempdir()?;
        let (src, hard, other) = create_files(&dir)?;
        let secs = NOW.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs();

        let mut common_data = CommonToolData::new(ToolType::SimilarImages);
        common_data.directories.set_included_directory([dir.path().to_owned()].to_vec());
        common_data.set_minimal_file_size(0);

        match DirTraversalBuilder::new().group_by(inode).common_data(&common_data).build().run() {
            DirTraversalResult::SuccessFiles {
                warnings: _,
                grouped_file_entries,
            } => {
                let actual: Vec<_> = grouped_file_entries.into_iter().flat_map(take_1_per_inode).collect();
                assert_eq!(
                    [
                        FileEntry {
                            path: src,
                            size: 1,
                            modified_date: secs,
                        },
                        FileEntry {
                            path: hard,
                            size: 1,
                            modified_date: secs,
                        },
                        FileEntry {
                            path: other,
                            size: 1,
                            modified_date: secs,
                        },
                    ]
                    .to_vec(),
                    actual
                );
            }
            _ => {
                panic!("Expect SuccessFiles.");
            }
        };
        Ok(())
    }
}
14 changes: 10 additions & 4 deletions czkawka_core/src/similar_images.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::common::{
HEIC_EXTENSIONS, IMAGE_RS_SIMILAR_IMAGES_EXTENSIONS, RAW_IMAGE_EXTENSIONS,
};
use crate::common_cache::{get_similar_images_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized};
use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType};
use crate::common_dir_traversal::{inode, take_1_per_inode, CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType};
use crate::common_tool::{CommonData, CommonToolData, DeleteMethod};
use crate::common_traits::{DebugPrint, PrintResults, ResultEntry};
use crate::flc;
Expand Down Expand Up @@ -122,6 +122,7 @@ pub struct SimilarImages {
hash_alg: HashAlg,
image_filter: FilterType,
exclude_images_with_same_size: bool,
ignore_hard_links: bool,
}

#[derive(Default)]
Expand All @@ -145,6 +146,7 @@ impl SimilarImages {
hash_alg: HashAlg::Gradient,
image_filter: FilterType::Lanczos3,
exclude_images_with_same_size: false,
ignore_hard_links: false,
}
}

Expand Down Expand Up @@ -188,7 +190,7 @@ impl SimilarImages {
let heic_extensions = HEIC_EXTENSIONS.iter().collect::<HashSet<_>>();

let result = DirTraversalBuilder::new()
.group_by(|_fe| ())
.group_by(inode)
.stop_receiver(stop_receiver)
.progress_sender(progress_sender)
.common_data(&self.common_data)
Expand All @@ -199,8 +201,8 @@ impl SimilarImages {
match result {
DirTraversalResult::SuccessFiles { grouped_file_entries, warnings } => {
self.images_to_check = grouped_file_entries
.into_values()
.flatten()
.into_iter()
.flat_map(if self.ignore_hard_links { |(_, fes)| fes } else { take_1_per_inode })
.map(|fe| {
let fe_str = fe.path.to_string_lossy().to_string();
let extension_lowercase = fe.path.extension().unwrap_or_default().to_string_lossy().to_lowercase();
Expand Down Expand Up @@ -1090,6 +1092,10 @@ impl SimilarImages {
/// Stores `similarity` in `self.similarity`.
///
/// The CLI entry point shown in this commit passes the value derived from the
/// similarity preset and the hash size.
pub fn set_similarity(&mut self, similarity: u32) {
self.similarity = similarity;
}

/// Stores the hard-link handling flag that is read when traversal results are collected
/// into `images_to_check`.
///
/// As the collection code is written, `true` keeps every entry of each inode group,
/// while `false` reduces each group with `take_1_per_inode`.
/// NOTE(review): that polarity reads inverted relative to the name "ignore hard links" —
/// confirm the intended behaviour against the CLI `-L` flag.
pub fn set_ignore_hard_links(&mut self, ignore_hard_links: bool) {
self.ignore_hard_links = ignore_hard_links;
}
}

#[cfg(test)]
Expand Down
14 changes: 10 additions & 4 deletions czkawka_core/src/similar_videos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use vid_dup_finder_lib::{NormalizedTolerance, VideoHash};

use crate::common::{check_if_stop_received, delete_files_custom, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, VIDEO_FILES_EXTENSIONS};
use crate::common_cache::{get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized};
use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType};
use crate::common_dir_traversal::{inode, take_1_per_inode, CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType};
use crate::common_tool::{CommonData, CommonToolData, DeleteMethod};
use crate::common_traits::{DebugPrint, PrintResults, ResultEntry};
use crate::flc;
Expand Down Expand Up @@ -83,6 +83,7 @@ pub struct SimilarVideos {
videos_to_check: BTreeMap<String, VideosEntry>,
tolerance: i32,
exclude_videos_with_same_size: bool,
ignore_hard_links: bool,
}

impl CommonData for SimilarVideos {
Expand Down Expand Up @@ -111,6 +112,7 @@ impl SimilarVideos {
tolerance: 10,
exclude_videos_with_same_size: false,
similar_referenced_vectors: vec![],
ignore_hard_links: false,
}
}

Expand Down Expand Up @@ -149,7 +151,7 @@ impl SimilarVideos {
}

let result = DirTraversalBuilder::new()
.group_by(|_fe| ())
.group_by(inode)
.stop_receiver(stop_receiver)
.progress_sender(progress_sender)
.common_data(&self.common_data)
Expand All @@ -160,8 +162,8 @@ impl SimilarVideos {
match result {
DirTraversalResult::SuccessFiles { grouped_file_entries, warnings } => {
self.videos_to_check = grouped_file_entries
.into_values()
.flatten()
.into_iter()
.flat_map(if self.ignore_hard_links { |(_, fes)| fes } else { take_1_per_inode })
.map(|fe| (fe.path.to_string_lossy().to_string(), fe.into_videos_entry()))
.collect();
self.common_data.text_messages.warnings.extend(warnings);
Expand Down Expand Up @@ -454,4 +456,8 @@ impl SimilarVideos {
/// Returns the `use_reference_folders` flag held in this tool's `common_data`.
pub fn get_use_reference(&self) -> bool {
self.common_data.use_reference_folders
}

/// Stores the hard-link handling flag that is read when traversal results are collected
/// into `videos_to_check`.
///
/// As the collection code is written, `true` keeps every entry of each inode group,
/// while `false` reduces each group with `take_1_per_inode`.
/// NOTE(review): that polarity reads inverted relative to the name "ignore hard links" —
/// confirm the intended behaviour against the CLI `-L` flag.
pub fn set_ignore_hard_links(&mut self, ignore_hard_links: bool) {
self.ignore_hard_links = ignore_hard_links;
}
}

0 comments on commit b63c631

Please sign in to comment.