diff --git a/src/index/components/hyperkmers_counts/mod.rs b/src/index/components/hyperkmers_counts/mod.rs index 36a2fb5..0c6fdf9 100644 --- a/src/index/components/hyperkmers_counts/mod.rs +++ b/src/index/components/hyperkmers_counts/mod.rs @@ -664,8 +664,19 @@ fn insert_new_right_hyperkmer_and_compute_associated_metadata( let mut start = sequence.as_vec(); start.extend_from_slice(&vec![b'A'; nb_base_missing]); let full_sequence = Subsequence::new(&start, 0, k - 1, true); - // TODO prove it - assert!(full_sequence.is_canonical()); // Adding A at the end of a canonical read does not make it canonical + + #[cfg(debug_assertions)] + { + // Adding A at the end of a canonical read keeps it canonical + if !full_sequence.is_canonical() { + // oops, we juste changed the canonicity of this sequence... + // ... or did we ? + // maybe it was a palindrome all along ! + if !full_sequence.is_equal_to_its_revcomp() { + panic!("Adding A at the end of a canonical read made it canonical"); + } + } + } let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence); // create the associated metadata @@ -676,7 +687,19 @@ fn insert_new_right_hyperkmer_and_compute_associated_metadata( start.extend_from_slice(&vec![b'T'; nb_base_missing]); let full_sequence = Subsequence::new(&start, 0, k - 1, true); // TODO prove it - assert!(!full_sequence.is_canonical()); // Adding T at the end of a non canonical read does not make it canonical + + #[cfg(debug_assertions)] + { + // Adding T at the end of a non canonical read does not make it canonical + if full_sequence.is_canonical() { + // oops, we juste changed the canonicity of this sequence... + // ... or did we ? + // maybe it was a palindrome all along ! + if !full_sequence.is_equal_to_its_revcomp() { + panic!(); + } + } + } let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence); // create the associated metadata HKMetadata::new(id_bucket, id_hk, 0, sequence.len(), false, true) @@ -697,8 +720,20 @@ fn insert_new_left_hyperkmer_and_compute_associated_metadata( let mut start = vec![b'A'; nb_base_missing]; start.extend_from_slice(&sequence.as_vec()); let full_sequence = Subsequence::new(&start, 0, k - 1, true); - // TODO prove it - assert!(full_sequence.is_canonical()); // Adding T at the beginning of a non canonical read does not make it canonical + assert!(full_sequence.is_canonical()); // Adding A at the beginning of a canonical read keep it canonical + + #[cfg(debug_assertions)] + { + // Adding T at the beginning of a non canonical read does not make it canonical + if !full_sequence.is_canonical() { + // oops, we juste changed the canonicity of this sequence... + // ... or did we ? + // maybe it was a palindrome all along ! + if !full_sequence.is_equal_to_its_revcomp() { + panic!(); + } + } + } let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence); // create the associated metadata @@ -714,9 +749,19 @@ fn insert_new_left_hyperkmer_and_compute_associated_metadata( } else { let mut start = vec![b'T'; nb_base_missing]; start.extend_from_slice(&sequence.as_vec()); - let full_sequence = Subsequence::new(&start, 0, k - 1, true); - // TODO prove it - assert!(!full_sequence.is_canonical()); // Adding T at the beginning of a non canonical read does not make it canonical + let full_sequence = Subsequence::new(&start, 0, k - 1, true); // DEBUG + #[cfg(debug_assertions)] + { + // Adding T at the beginning of a non canonical read does not make it canonical + if full_sequence.is_canonical() { + // oops, we juste changed the canonicity of this sequence... + // ... or did we ? + // maybe it was a palindrome all along ! + if !full_sequence.is_equal_to_its_revcomp() { + panic!(); + } + } + } let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence); // create the associated metadata HKMetadata::new( diff --git a/src/index/mod.rs b/src/index/mod.rs index 0697923..3346a1d 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -5,16 +5,15 @@ mod extraction; use super::Minimizer; use crate::serde::kff::{build_values, create_block, write_blocks, KFFError, ENCODING}; -use crate::Count; use crate::{ compute_left_and_right::get_left_and_rigth_of_sk, superkmers_computation::compute_superkmers_linear_streaming, }; +use crate::{format_number_u64, Count}; use extraction::{extract_context, extract_kmers_from_contexts_associated_to_a_minimizer}; -use components::{HKCount, LargeExtendedHyperkmer, ParallelExtendedHyperkmers, SuperKmerCounts}; - use crate::buckets::Buckets; +use components::{HKCount, LargeExtendedHyperkmer, ParallelExtendedHyperkmers, SuperKmerCounts}; use computation::{first_stage, second_stage}; use kff::Kff; use rayon::prelude::*; @@ -30,6 +29,8 @@ use std::io::BufWriter; use std::io::Write; use std::marker::PhantomData; use std::path::Path; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::{AtomicU64, Ordering::SeqCst}; use std::sync::RwLock; use std::sync::{Arc, Mutex}; use std::time::Instant; @@ -182,13 +183,16 @@ impl Index { /// Constructs a new `Index` indexing a set of sequences #[allow(clippy::self_named_constructors)] // Self named constructor ? I want it that way 🎵 pub fn index>(k: usize, m: usize, threshold: Count, path: P) -> Self { - let start_fisrt_step = Instant::now(); + let start_fisrt_stage = Instant::now(); let (super_kmer_counts, hk_count, hyperkmers, large_hyperkmers) = first_stage(&path, k, m, threshold); + let time_first_stage = start_fisrt_stage.elapsed().as_secs(); println!( - "time first stage: {} milliseconds", - start_fisrt_step.elapsed().as_millis() + "time first stage: {} second{}", + format_number_u64(time_first_stage), + if time_first_stage > 1 { "s" } else { "" } ); + let start_second_stage = Instant::now(); let discarded_minimizers = second_stage( &super_kmer_counts, @@ -200,9 +204,11 @@ impl Index { m, threshold, ); + let time_second_stage = start_second_stage.elapsed().as_secs(); println!( - "time second stage: {} milliseconds", - start_second_stage.elapsed().as_millis() + "time second stage: {} second{}", + format_number_u64(time_second_stage), + if time_second_stage > 1 { "s" } else { "" } ); Self { @@ -338,6 +344,37 @@ where }); } + pub fn count_minimizers_and_kmers(&self) -> (usize, usize) { + let nb_minimizers = AtomicUsize::new(0); + let nb_kmers = AtomicUsize::new(0); + let hk_count_chunks = self.hk_count.chunks(); + + hk_count_chunks.par_iter().for_each(|chunk| { + let k = self.k; + let m = self.m; + let hyperkmers = self.hyperkmers.read().unwrap(); + let large_hyperkmers = self.large_hyperkmers.read().unwrap(); + let hk_count_chunk = chunk.read().unwrap(); + + let mut km_counts_grouped_by_key = + hk_count_chunk.get_data().iter_group_by_key().peekable(); + + while let Some(kmers) = extract_kmers_from_contexts_associated_to_a_minimizer( + &mut km_counts_grouped_by_key, + &hyperkmers, + &large_hyperkmers, + &k, + &m, + ) { + nb_minimizers.fetch_add(1, SeqCst); + nb_kmers.fetch_add(kmers.len(), SeqCst); + } + }); + let nb_minimizers = nb_minimizers.load(SeqCst); + let nb_kmers = nb_kmers.load(SeqCst); + (nb_minimizers, nb_kmers) + } + pub fn par_write_kff>(&self, path: P) -> Result<(), KFFError> { // TODO discuss: our kmers are non unique and canonical, right ? let header = kff::section::Header::new(1, 0, ENCODING, false, true, b"".to_vec()) diff --git a/src/main.rs b/src/main.rs index cec260d..18638af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::Write; use std::io::{BufRead, BufReader, BufWriter}; -use std::path::Path; +use std::path::{Path, PathBuf}; use superkmers_computation::is_canonical; type Minimizer = u64; @@ -78,7 +78,7 @@ struct BuildArgs { /// Input file (FASTA/Q, possibly gzipped) #[arg(short, long)] input: String, - /// Output file (no dump by default) + /// Output file ({input}.kff by default) #[arg(short, long)] output: Option, /// Number of threads (use all core by default) @@ -87,7 +87,7 @@ struct BuildArgs { /// Check against the results of KMC (no check by default) #[arg(long)] check_kmc: Option, - /// Dump full index, with superkmer informations (partial dump by default) + /// Expert parameter: dump additional information with the index. KFC cannot read these indexes from the CLI. (partial dump by default) #[arg(short, long)] fulldump: bool, /// Print some statistics about the index (no print by default) @@ -130,6 +130,50 @@ struct KFFDumpArgs { threads: Option, } +/// Formats a u64 in a String +fn format_number_u64(number: u64) -> String { + number + .to_string() + .as_bytes() + .rchunks(3) + .rev() + .map(std::str::from_utf8) + .collect::, _>>() + .unwrap() + .join(",") +} + +/// Formats a usize in a String +fn format_number_usize(number: usize) -> String { + number + .to_string() + .as_bytes() + .rchunks(3) + .rev() + .map(std::str::from_utf8) + .collect::, _>>() + .unwrap() + .join(",") +} + +/// Checks a file exists. Exits the program if the path is not a file. +fn check_file_exists

(filepath: P) +where + P: AsRef, +{ + if filepath.as_ref().exists() { + if !filepath.as_ref().is_file() { + let filepath = filepath.as_ref().to_str().unwrap(); + eprintln!("Error: '{filepath}' is not a file."); + std::process::exit(1); + } + } else { + let filepath = filepath.as_ref().to_str().unwrap(); + eprintln!("Error: file '{filepath}' does not exist."); + std::process::exit(1); + } +} + /// Prints some statistics about the index fn print_stats( index: &Index, @@ -143,17 +187,38 @@ fn print_stats( .read() .expect("could not acquire read lock"); - let nb_base_in_large_hyperkmers: usize = - large_hyperkmers.iter().map(|large_hk| large_hk.0).sum(); let number_of_hyperkmers = hyperkmers.get_nb_inserted(); let number_of_large_hyperkmers = large_hyperkmers.len(); + let nb_base_in_hyperkmers: usize = number_of_hyperkmers * (k - 1); + let nb_base_in_large_hyperkmers: usize = + large_hyperkmers.iter().map(|large_hk| large_hk.0).sum(); + let (nb_minimizers, nb_kmers) = index.count_minimizers_and_kmers(); + println!("===== stats ====="); - p!(number_of_hyperkmers); - p!(number_of_large_hyperkmers); - println!("nb bases in hyperkmers: {}", number_of_hyperkmers * (k - 1)); println!( - "nb base in large hyperkmers: {}", - nb_base_in_large_hyperkmers + "indexed {} kmers with {} minimizers", + format_number_usize(nb_minimizers), + format_number_usize(nb_kmers) + ); + println!( + "number of hyperkmers: {}", + format_number_usize(number_of_hyperkmers + number_of_large_hyperkmers) + ); + + #[cfg(debug_assertions)] + println!( + " [debug] including {} large hyperkmers", + format_number_usize(number_of_large_hyperkmers) + ); + + println!( + "number of bases in hyperkmers: {}", + format_number_usize(nb_base_in_hyperkmers + nb_base_in_large_hyperkmers) + ); + #[cfg(debug_assertions)] + println!( + " [debug] including {} bases in large hyperkmers", + format_number_usize(nb_base_in_large_hyperkmers) ); } @@ -211,11 +276,24 @@ fn main() { match args.command { Command::Build(args) => { let k = args.k; - assert!(k % 2 == 1, "k must be odd"); let m = args.m; - assert!(m % 2 == 1, "m must be odd"); let threshold = args.threshold_superkmer; + // check that k and m are odd + if k % 2 != 1 { + eprintln!("Error: k must be odd."); + std::process::exit(1); + } + if m % 2 != 1 { + eprintln!("Error: m must be odd."); + std::process::exit(1); + } + + check_file_exists(&args.input); + if let Some(ref kmc_file) = args.check_kmc { + check_file_exists(kmc_file); + } + // set the number of threads if let Some(nb_threads) = args.threads { rayon::ThreadPoolBuilder::new() @@ -224,6 +302,17 @@ fn main() { .expect("unable to configure the threads"); } + // get output filename + let output_file = match args.output { + Some(filename) => filename, + None => { + let mut path = PathBuf::from(&args.input); + // Set the new extension or add one if there's none + path.set_extension("kfc"); + path.to_string_lossy().to_string() + } + }; + // build the index let index = Index::::index(k, m, threshold, args.input); // use KMC's output as a query against the index @@ -236,25 +325,23 @@ fn main() { } // write the index to disk - if let Some(output_file) = args.output { - if args.fulldump { - serde::bin::dump(&index, &output_file).expect("impossible to dump"); - - #[cfg(debug_assertions)] - { - let index2 = serde::bin::load(&output_file).expect("impossible to load"); - debug_assert!(index == index2); - } - } else { - // not a full dump: let's remove infos from the index - let index = index.remove_superkmer_infos(); - serde::bin::dump(&index, &output_file).expect("impossible to dump"); - - #[cfg(debug_assertions)] - { - let index2 = serde::bin::load(&output_file).expect("impossible to load"); - debug_assert!(index == index2); - } + if args.fulldump { + serde::bin::dump(&index, &output_file).expect("impossible to dump"); + + #[cfg(debug_assertions)] + { + let index2 = serde::bin::load(&output_file).expect("impossible to load"); + debug_assert!(index == index2); + } + } else { + // not a full dump: let's remove infos from the index + let index = index.remove_superkmer_infos(); + serde::bin::dump(&index, &output_file).expect("impossible to dump"); + + #[cfg(debug_assertions)] + { + let index2 = serde::bin::load(&output_file).expect("impossible to load"); + debug_assert!(index == index2); } } } @@ -262,6 +349,8 @@ fn main() { let input = args.input_index; let kmer_threshold = args.kmer_threshold; + check_file_exists(&input); + // set the number of threads if let Some(nb_threads) = args.threads { rayon::ThreadPoolBuilder::new() @@ -281,6 +370,8 @@ fn main() { Command::KFFDump(args) => { let kmer_threshold = args.kmer_threshold; + check_file_exists(&args.input_kff); + // set the number of threads if let Some(nb_threads) = args.threads { rayon::ThreadPoolBuilder::new() diff --git a/src/subsequence.rs b/src/subsequence.rs index 6cb60c1..9e2d57a 100644 --- a/src/subsequence.rs +++ b/src/subsequence.rs @@ -74,6 +74,14 @@ impl<'a> Subsequence> { self.same_orientation == is_original_subsequence_canonical } + #[cfg(debug_assertions)] + pub fn is_equal_to_its_revcomp(&self) -> bool { + use crate::superkmers_computation::is_equal_to_its_revcomp; + + let subsequence = &self.packing.read[self.start..self.end]; + is_equal_to_its_revcomp(subsequence) + } + pub fn to_canonical(self) -> Self { let subsequence = &self.packing.read[self.start..self.end]; diff --git a/src/superkmers_computation.rs b/src/superkmers_computation.rs index 76e5a8d..a3c3d44 100644 --- a/src/superkmers_computation.rs +++ b/src/superkmers_computation.rs @@ -40,6 +40,19 @@ pub fn is_canonical(seq: &[u8]) -> bool { true } +pub fn is_equal_to_its_revcomp(seq: &[u8]) -> bool { + let mut orientation_1 = same_orientation(seq); + let mut orientation_2 = reverse_complement(seq); + while let (Some(xc), Some(yc)) = (orientation_1.next(), orientation_2.next()) { + let xc = if unlikely(xc == b'N') { b'A' } else { xc }; + + if xc != yc { + return false; + } + } + true +} + pub struct SuperkmerIterator<'a> { sequence: &'a [u8], minimizer_iter: std::iter::Peekable>,