From 8197bf677ea34b5969e0b7b03d1356719601d9b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= Date: Fri, 6 Oct 2023 21:09:35 +0200 Subject: [PATCH] Simplified a lot of cache concept --- czkawka_core/src/common_cache.rs | 109 ++++++----- czkawka_core/src/duplicate.rs | 175 ++++-------------- .../src/connect_things/connect_settings.rs | 15 +- 3 files changed, 115 insertions(+), 184 deletions(-) diff --git a/czkawka_core/src/common_cache.rs b/czkawka_core/src/common_cache.rs index 76af08dfc..b5268a782 100644 --- a/czkawka_core/src/common_cache.rs +++ b/czkawka_core/src/common_cache.rs @@ -85,26 +85,86 @@ where pub fn load_cache_from_file_generalized_by_path(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap) -> (Messages, Option>) where - for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync, + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, { let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, used_files); let Some(vec_loaded_entries) = vec_loaded_cache else { return (text_messages, None); }; - debug!("Converting cache vec into map"); + debug!("Converting cache Vec into BTreeMap"); let map_loaded_entries: BTreeMap = vec_loaded_entries .into_iter() .map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry)) .collect(); - debug!("Converted cache vec into map"); + debug!("Converted cache Vec into BTreeMap"); + + (text_messages, Some(map_loaded_entries)) +} + +pub fn load_cache_from_file_generalized_by_size( + cache_file_name: &str, + delete_outdated_cache: bool, + cache_not_converted: &BTreeMap>, +) -> (Messages, Option>>) +where + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, +{ + debug!("Converting cache BtreeMap> into BTreeMap"); + let mut used_files: BTreeMap = Default::default(); + for file_entry in cache_not_converted.values().flatten() { + used_files.insert(file_entry.get_path().to_string_lossy().into_owned(), file_entry.clone()); + } + debug!("Converted cache BtreeMap> into BTreeMap"); + + let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, &used_files); + let Some(vec_loaded_entries) = vec_loaded_cache else { + return (text_messages, None); + }; + + debug!("Converting cache Vec into BTreeMap>"); + let mut map_loaded_entries: BTreeMap> = Default::default(); + for file_entry in vec_loaded_entries { + map_loaded_entries.entry(file_entry.get_size()).or_default().push(file_entry); + } + debug!("Converted cache Vec into BTreeMap>"); + + (text_messages, Some(map_loaded_entries)) +} + +pub fn load_cache_from_file_generalized_by_path_from_size( + cache_file_name: &str, + delete_outdated_cache: bool, + cache_not_converted: &BTreeMap>, +) -> (Messages, Option>) +where + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, +{ + debug!("Converting cache BtreeMap> into BTreeMap"); + let mut used_files: BTreeMap = Default::default(); + for file_entry in cache_not_converted.values().flatten() { + used_files.insert(file_entry.get_path().to_string_lossy().into_owned(), file_entry.clone()); + } + debug!("Converted cache BtreeMap> into BTreeMap"); + + let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, &used_files); + let Some(vec_loaded_entries) = vec_loaded_cache else { + return (text_messages, None); + }; + + debug!("Converting cache Vec into BTreeMap"); + let map_loaded_entries: BTreeMap = vec_loaded_entries + .into_iter() + .map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry)) + .collect(); + debug!("Converted cache Vec into BTreeMap"); (text_messages, Some(map_loaded_entries)) } fn load_cache_from_file_generalized(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap) -> (Messages, Option>) where - for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync, + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, { debug!("Loading cache from file {} (or json alternative)", cache_file_name); let mut text_messages = Messages::new(); @@ -175,44 +235,3 @@ where debug!("Failed to load cache from file {cache_file_name} because not exists"); (text_messages, None) } - -// pub fn save_hashes_to_file(cache_file_name: &str, hashmap: &BTreeMap>, save_also_as_json: bool) -> Messages -// where -// T: Serialize + ResultEntry + Sized + Send + Sync, -// { -// debug!("Saving cache to file {} (or also json alternative) - {} results", cache_file_name, hashmap.len()); -// let mut text_messages = Messages::new(); -// if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = -// common::open_cache_folder(cache_file_name, true, save_also_as_json, &mut text_messages.warnings) -// { -// { -// let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here -// if let Err(e) = bincode::serialize_into(writer, &hashmap.values().collect::>()) { -// text_messages -// .warnings -// .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e)); -// debug!("Failed to save cache to file {:?}", cache_file); -// return text_messages; -// } -// debug!("Saved binary to file {:?}", cache_file); -// } -// if save_also_as_json { -// if let Some(file_handler_json) = file_handler_json { -// let writer = BufWriter::new(file_handler_json); -// if let Err(e) = serde_json::to_writer(writer, &hashmap.values().collect::>()) { -// text_messages -// .warnings -// .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e)); -// debug!("Failed to save cache to file {:?}", cache_file_json); -// return text_messages; -// } -// debug!("Saved json to file {:?}", cache_file_json); -// } -// } -// -// text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len())); -// } else { -// debug!("Failed to save cache to file {cache_file_name} because not exists"); -// } -// text_messages -// } diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs index 375d0fbbb..f552eed7d 100644 --- a/czkawka_core/src/duplicate.rs +++ b/czkawka_core/src/duplicate.rs @@ -4,10 +4,10 @@ use std::collections::HashSet; use std::fs::File; use std::hash::Hasher; use std::io::prelude::*; -use std::io::{self, BufReader, BufWriter, Error, ErrorKind}; +use std::io::{self, BufWriter, Error, ErrorKind}; #[cfg(target_family = "unix")] use std::os::unix::fs::MetadataExt; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::atomic::Ordering; use std::{fs, mem}; @@ -18,14 +18,12 @@ use log::{debug, info}; use rayon::prelude::*; use xxhash_rust::xxh3::Xxh3; -use crate::common::{open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads}; -use crate::common_cache::{get_duplicate_cache_file, save_cache_to_file_generalized}; +use crate::common::{prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads}; +use crate::common_cache::{get_duplicate_cache_file, load_cache_from_file_generalized_by_size, save_cache_to_file_generalized}; use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; use crate::common_messages::Messages; use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::*; -use crate::flc; -use crate::localizer_core::generate_translation_hashmap; const TEMP_HARDLINK_FILE: &str = "rzeczek.rxrxrxl"; @@ -536,32 +534,28 @@ impl DuplicateFinder { if self.use_prehash_cache { debug!("prehash_load_cache_at_start - using prehash cache start"); - loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, &self.hash_type, true) { - Some(t) => t, - None => Default::default(), - }; - let mut loaded_hash_map2: BTreeMap = Default::default(); - for vec_file_entry in loaded_hash_map.values() { - for file_entry in vec_file_entry { - loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); - } - } + let (messages, loaded_items) = load_cache_from_file_generalized_by_size::( + &get_duplicate_cache_file(&self.hash_type, true), + self.get_delete_outdated_cache(), + &self.files_with_identical_size, + ); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); - #[allow(clippy::if_same_then_else)] - for vec_file_entry in self.files_with_identical_size.values() { - for file_entry in vec_file_entry { - let name = file_entry.path.to_string_lossy().to_string(); - if !loaded_hash_map2.contains_key(&name) { - // If loaded data doesn't contains current image info - non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry.clone()); - } else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date { - // When size or modification date of image changed, then it is clear that is different image - non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry.clone()); - } else { - // Checking may be omitted when already there is entry with same size and modification date - records_already_cached.entry(file_entry.size).or_default().push(file_entry.clone()); + for (size, vec_file_entry) in mem::take(&mut self.files_with_identical_size) { + if let Some(vec_file_entry) = loaded_hash_map.get(&size) { + // TODO maybe hashset is not needed when using < 4 elements + let cached_path_entries = vec_file_entry.iter().map(|e| &e.path).collect::>(); + for file_entry in vec_file_entry { + if cached_path_entries.contains(&file_entry.path) { + records_already_cached.entry(size).or_default().push(file_entry.clone()); + } else { + non_cached_files_to_check.entry(size).or_default().push(file_entry.clone()); + } } + } else { + non_cached_files_to_check.entry(size).or_default().append(&mut vec_file_entry.clone()); } } } else { @@ -693,35 +687,28 @@ impl DuplicateFinder { if self.common_data.use_cache { debug!("full_hashing_load_cache_at_start - using cache"); - loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, &self.hash_type, false) { - Some(t) => t, - None => Default::default(), - }; + let (messages, loaded_items) = + load_cache_from_file_generalized_by_size::(&get_duplicate_cache_file(&self.hash_type, false), self.get_delete_outdated_cache(), &pre_checked_map); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); + debug!("full_hashing_load_cache_at_start - started diff between loaded and prechecked files"); for (size, vec_file_entry) in pre_checked_map { - #[allow(clippy::collapsible_if)] - if !loaded_hash_map.contains_key(&size) { - // If loaded data doesn't contains current info - non_cached_files_to_check.insert(size, vec_file_entry); - } else { - let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap(); - + if let Some(vec_file_entry) = loaded_hash_map.get(&size) { + // TODO maybe hashset is not needed when using < 4 elements + let cached_path_entries = vec_file_entry.iter().map(|e| &e.path).collect::>(); for file_entry in vec_file_entry { - let mut found: bool = false; - for loaded_file_entry in loaded_vec_file_entry { - if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date { - records_already_cached.entry(file_entry.size).or_default().push(loaded_file_entry.clone()); - found = true; - break; - } - } - - if !found { - non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry); + if cached_path_entries.contains(&file_entry.path) { + records_already_cached.entry(size).or_default().push(file_entry.clone()); + } else { + non_cached_files_to_check.entry(size).or_default().push(file_entry.clone()); } } + } else { + non_cached_files_to_check.entry(size).or_default().append(&mut vec_file_entry.clone()); } } + debug!("full_hashing_load_cache_at_start - completed diff between loaded and prechecked files"); } else { debug!("full_hashing_load_cache_at_start - not using cache"); loaded_hash_map = Default::default(); @@ -1328,86 +1315,6 @@ pub fn make_hard_link(src: &Path, dst: &Path) -> io::Result<()> { result } -pub fn load_hashes_from_file(text_messages: &mut Messages, delete_outdated_cache: bool, type_of_hash: &HashType, is_prehash: bool) -> Option>> { - if let Some(((file_handler, cache_file), (_json_file, _json_name))) = - open_cache_folder(&get_file_hash_name(type_of_hash, is_prehash), false, false, &mut text_messages.warnings) - { - // Unwrap could fail when failed to open cache file, but json would exists - let Some(file_handler) = file_handler else { - return Default::default(); - }; - let reader = BufReader::new(file_handler); - - let mut hashmap_loaded_entries: BTreeMap> = Default::default(); - - // Read the file line by line using the lines() iterator from std::io::BufRead. - for (index, line) in reader.lines().enumerate() { - let line = match line { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load line number {} from cache file {}, reason {}", index + 1, cache_file.display(), e)); - return None; - } - }; - let uuu = line.split("//").collect::>(); - if uuu.len() != 4 { - text_messages.warnings.push(format!( - "Found invalid data(too much or too low amount of data) in line {} - ({}) in cache file {}", - index + 1, - line, - cache_file.display() - )); - continue; - } - // Don't load cache data if destination file not exists - if !delete_outdated_cache || Path::new(uuu[0]).exists() { - let file_entry = FileEntry { - path: PathBuf::from(uuu[0]), - size: match uuu[1].parse::() { - Ok(t) => t, - Err(e) => { - text_messages.warnings.push(format!( - "Found invalid size value in line {} - ({}) in cache file {}, reason {}", - index + 1, - line, - cache_file.display(), - e - )); - continue; - } - }, - modified_date: match uuu[2].parse::() { - Ok(t) => t, - Err(e) => { - text_messages.warnings.push(format!( - "Found invalid modified date value in line {} - ({}) in cache file {}, reason {}", - index + 1, - line, - cache_file.display(), - e - )); - continue; - } - }, - hash: uuu[3].to_string(), - symlink_info: None, - }; - hashmap_loaded_entries.entry(file_entry.size).or_default().push(file_entry); - } - } - - text_messages.messages.push(flc!( - "core_loading_from_cache", - generate_translation_hashmap(vec![("number", hashmap_loaded_entries.values().map(std::vec::Vec::len).sum::().to_string())]) - )); - - return Some(hashmap_loaded_entries); - } - None -} - pub trait MyHasher { fn update(&mut self, bytes: &[u8]); fn finalize(&self) -> String; @@ -1437,11 +1344,6 @@ fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashT Ok(hasher.finalize()) } -fn get_file_hash_name(type_of_hash: &HashType, is_prehash: bool) -> String { - let prehash_str = if is_prehash { "_prehash" } else { "" }; - format!("cache_duplicates_{type_of_hash:?}{prehash_str}.txt") -} - impl MyHasher for blake3::Hasher { fn update(&mut self, bytes: &[u8]) { self.update(bytes); @@ -1486,6 +1388,7 @@ mod tests { use std::os::fs::MetadataExt; #[cfg(target_family = "unix")] use std::os::unix::fs::MetadataExt; + use std::path::PathBuf; use super::*; diff --git a/czkawka_gui/src/connect_things/connect_settings.rs b/czkawka_gui/src/connect_things/connect_settings.rs index f9f3249a8..1f3a82796 100644 --- a/czkawka_gui/src/connect_things/connect_settings.rs +++ b/czkawka_gui/src/connect_things/connect_settings.rs @@ -2,7 +2,8 @@ use std::collections::BTreeMap; use std::default::Default; use czkawka_core::common_cache::{ - get_duplicate_cache_file, get_similar_images_cache_file, get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized, + get_duplicate_cache_file, get_similar_images_cache_file, get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, load_cache_from_file_generalized_by_size, + save_cache_to_file_generalized, }; use directories_next::ProjectDirs; use gtk4::prelude::*; @@ -124,7 +125,13 @@ pub fn connect_settings(gui_data: &GuiData) { let mut messages: Messages = Messages::new(); for use_prehash in [true, false] { for type_of_hash in &[HashType::Xxh3, HashType::Blake3, HashType::Crc32] { - if let Some(cache_entries) = czkawka_core::duplicate::load_hashes_from_file(&mut messages, true, type_of_hash, use_prehash) { + let (mut messages, loaded_items) = load_cache_from_file_generalized_by_size::( + &get_duplicate_cache_file(type_of_hash, use_prehash), + true, + &Default::default(), + ); + + if let Some(cache_entries) = loaded_items { let mut hashmap_to_save: BTreeMap = Default::default(); for (_, vec_file_entry) in cache_entries { for file_entry in vec_file_entry { @@ -134,7 +141,9 @@ pub fn connect_settings(gui_data: &GuiData) { let minimal_cache_size = entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(2 * 1024 * 1024); - save_cache_to_file_generalized(&get_duplicate_cache_file(type_of_hash, use_prehash), &hashmap_to_save, false, minimal_cache_size); + let save_messages = + save_cache_to_file_generalized(&get_duplicate_cache_file(type_of_hash, use_prehash), &hashmap_to_save, false, minimal_cache_size); + messages.extend_with_another_messages(save_messages); } }