
Merge pull request #3 from lrobidou/dev
Fix panic and add quality of life features
lrobidou authored Oct 14, 2024
2 parents 1adaaaf + da89261 commit c624e01
Showing 5 changed files with 241 additions and 47 deletions.
61 changes: 53 additions & 8 deletions src/index/components/hyperkmers_counts/mod.rs
@@ -664,8 +664,19 @@ fn insert_new_right_hyperkmer_and_compute_associated_metadata(
let mut start = sequence.as_vec();
start.extend_from_slice(&vec![b'A'; nb_base_missing]);
let full_sequence = Subsequence::new(&start, 0, k - 1, true);
// TODO prove it
assert!(full_sequence.is_canonical()); // Adding A at the end of a canonical read does not make it canonical

#[cfg(debug_assertions)]
{
// Adding A at the end of a canonical read keeps it canonical
if !full_sequence.is_canonical() {
// oops, we just changed the canonicity of this sequence...
// ... or did we?
// maybe it was a palindrome all along!
if !full_sequence.is_equal_to_its_revcomp() {
panic!("Adding A at the end of a canonical read made it non canonical");
}
}
}

let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence);
// create the associated metadata
@@ -676,7 +687,19 @@ fn insert_new_right_hyperkmer_and_compute_associated_metadata(
start.extend_from_slice(&vec![b'T'; nb_base_missing]);
let full_sequence = Subsequence::new(&start, 0, k - 1, true);
// TODO prove it
assert!(!full_sequence.is_canonical()); // Adding T at the end of a non canonical read does not make it canonical

#[cfg(debug_assertions)]
{
// Adding T at the end of a non canonical read does not make it canonical
if full_sequence.is_canonical() {
// oops, we just changed the canonicity of this sequence...
// ... or did we?
// maybe it was a palindrome all along!
if !full_sequence.is_equal_to_its_revcomp() {
panic!();
}
}
}
let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence);
// create the associated metadata
HKMetadata::new(id_bucket, id_hk, 0, sequence.len(), false, true)
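The debug blocks above encode a canonicity argument: right-padding a canonical sequence with 'A' (or a non-canonical one with 'T') is not expected to flip its canonicity, except possibly when the padded sequence equals its own reverse complement. Below is a minimal, self-contained sketch of that claim, assuming a plain lexicographic notion of canonicity; revcomp and is_canonical are hypothetical stand-ins, not the crate's Subsequence API:

// Illustration only: "canonical" here means lexicographically <= the reverse complement.
fn revcomp(s: &[u8]) -> Vec<u8> {
    s.iter()
        .rev()
        .map(|&b| match b {
            b'A' => b'T',
            b'T' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            other => other,
        })
        .collect()
}

fn is_canonical(s: &[u8]) -> bool {
    s <= revcomp(s).as_slice()
}

fn main() {
    // A canonical sequence right-padded with 'A' stays canonical.
    let canonical = b"ACGTC";
    assert!(is_canonical(canonical));
    assert!(is_canonical(&[&canonical[..], &b"AA"[..]].concat()));

    // A non-canonical sequence right-padded with 'T' stays non-canonical.
    let non_canonical = b"GACGT";
    assert!(!is_canonical(non_canonical));
    assert!(!is_canonical(&[&non_canonical[..], &b"TT"[..]].concat()));
}

The palindrome escape hatch in the debug blocks covers the tie case: when the padded sequence is its own reverse complement, canonicity is a matter of convention, so the check does not panic there.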
@@ -697,8 +720,20 @@ fn insert_new_left_hyperkmer_and_compute_associated_metadata(
let mut start = vec![b'A'; nb_base_missing];
start.extend_from_slice(&sequence.as_vec());
let full_sequence = Subsequence::new(&start, 0, k - 1, true);
// TODO prove it
assert!(full_sequence.is_canonical()); // Adding T at the beginning of a non canonical read does not make it canonical
assert!(full_sequence.is_canonical()); // Adding A at the beginning of a canonical read keeps it canonical

#[cfg(debug_assertions)]
{
// Adding A at the beginning of a canonical read keeps it canonical
if !full_sequence.is_canonical() {
// oops, we just changed the canonicity of this sequence...
// ... or did we?
// maybe it was a palindrome all along!
if !full_sequence.is_equal_to_its_revcomp() {
panic!();
}
}
}

let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence);
// create the associated metadata
@@ -714,9 +749,19 @@ fn insert_new_left_hyperkmer_and_compute_associated_metadata(
} else {
let mut start = vec![b'T'; nb_base_missing];
start.extend_from_slice(&sequence.as_vec());
let full_sequence = Subsequence::new(&start, 0, k - 1, true);
// TODO prove it
assert!(!full_sequence.is_canonical()); // Adding T at the beginning of a non canonical read does not make it canonical
let full_sequence = Subsequence::new(&start, 0, k - 1, true);
#[cfg(debug_assertions)]
{
// Adding T at the beginning of a non canonical read does not make it canonical
if full_sequence.is_canonical() {
// oops, we just changed the canonicity of this sequence...
// ... or did we?
// maybe it was a palindrome all along!
if !full_sequence.is_equal_to_its_revcomp() {
panic!();
}
}
}
let (id_bucket, id_hk) = hyperkmers.add_new_ext_hyperkmer(&full_sequence);
// create the associated metadata
HKMetadata::new(
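The corner case those checks tolerate is a reverse-complement palindrome. Here is a short sketch of such a test, using a hypothetical helper rather than the crate's Subsequence::is_equal_to_its_revcomp; note that a DNA sequence can only equal its reverse complement if its length is even, since a middle base would have to be its own complement:

// Illustration only: a sequence equals its reverse complement exactly when
// every base is the Watson-Crick complement of its mirror base.
fn is_equal_to_its_revcomp(s: &[u8]) -> bool {
    s.iter().zip(s.iter().rev()).all(|(a, b)| {
        matches!((*a, *b), (b'A', b'T') | (b'T', b'A') | (b'C', b'G') | (b'G', b'C'))
    })
}

fn main() {
    assert!(is_equal_to_its_revcomp(b"ACGT"));   // palindrome
    assert!(is_equal_to_its_revcomp(b"AATT"));   // palindrome
    assert!(!is_equal_to_its_revcomp(b"ACGTC")); // odd length, cannot be one
}
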
53 changes: 45 additions & 8 deletions src/index/mod.rs
@@ -5,16 +5,15 @@ mod extraction;

use super::Minimizer;
use crate::serde::kff::{build_values, create_block, write_blocks, KFFError, ENCODING};
use crate::Count;
use crate::{
compute_left_and_right::get_left_and_rigth_of_sk,
superkmers_computation::compute_superkmers_linear_streaming,
};
use crate::{format_number_u64, Count};
use extraction::{extract_context, extract_kmers_from_contexts_associated_to_a_minimizer};

use components::{HKCount, LargeExtendedHyperkmer, ParallelExtendedHyperkmers, SuperKmerCounts};

use crate::buckets::Buckets;
use components::{HKCount, LargeExtendedHyperkmer, ParallelExtendedHyperkmers, SuperKmerCounts};
use computation::{first_stage, second_stage};
use kff::Kff;
use rayon::prelude::*;
@@ -30,6 +29,8 @@ use std::io::BufWriter;
use std::io::Write;
use std::marker::PhantomData;
use std::path::Path;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::{AtomicU64, Ordering::SeqCst};
use std::sync::RwLock;
use std::sync::{Arc, Mutex};
use std::time::Instant;
@@ -182,13 +183,16 @@ impl Index<CompleteIndex> {
/// Constructs a new `Index` indexing a set of sequences
#[allow(clippy::self_named_constructors)] // Self named constructor ? I want it that way 🎵
pub fn index<P: AsRef<Path>>(k: usize, m: usize, threshold: Count, path: P) -> Self {
let start_fisrt_step = Instant::now();
let start_fisrt_stage = Instant::now();
let (super_kmer_counts, hk_count, hyperkmers, large_hyperkmers) =
first_stage(&path, k, m, threshold);
let time_first_stage = start_fisrt_stage.elapsed().as_secs();
println!(
"time first stage: {} milliseconds",
start_fisrt_step.elapsed().as_millis()
"time first stage: {} second{}",
format_number_u64(time_first_stage),
if time_first_stage > 1 { "s" } else { "" }
);

let start_second_stage = Instant::now();
let discarded_minimizers = second_stage(
&super_kmer_counts,
@@ -200,9 +204,11 @@
m,
threshold,
);
let time_second_stage = start_second_stage.elapsed().as_secs();
println!(
"time second stage: {} milliseconds",
start_second_stage.elapsed().as_millis()
"time second stage: {} second{}",
format_number_u64(time_second_stage),
if time_second_stage > 1 { "s" } else { "" }
);

Self {
@@ -338,6 +344,37 @@
});
}

pub fn count_minimizers_and_kmers(&self) -> (usize, usize) {
let nb_minimizers = AtomicUsize::new(0);
let nb_kmers = AtomicUsize::new(0);
let hk_count_chunks = self.hk_count.chunks();

hk_count_chunks.par_iter().for_each(|chunk| {
let k = self.k;
let m = self.m;
let hyperkmers = self.hyperkmers.read().unwrap();
let large_hyperkmers = self.large_hyperkmers.read().unwrap();
let hk_count_chunk = chunk.read().unwrap();

let mut km_counts_grouped_by_key =
hk_count_chunk.get_data().iter_group_by_key().peekable();

while let Some(kmers) = extract_kmers_from_contexts_associated_to_a_minimizer(
&mut km_counts_grouped_by_key,
&hyperkmers,
&large_hyperkmers,
&k,
&m,
) {
nb_minimizers.fetch_add(1, SeqCst);
nb_kmers.fetch_add(kmers.len(), SeqCst);
}
});
let nb_minimizers = nb_minimizers.load(SeqCst);
let nb_kmers = nb_kmers.load(SeqCst);
(nb_minimizers, nb_kmers)
}
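The method above follows a simple pattern: shared AtomicUsize counters incremented with fetch_add from inside a rayon parallel iterator, then read once with load after the parallel section. A standalone sketch of that pattern (illustration only, not tied to the index types in this file):

use rayon::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering::SeqCst};

fn main() {
    // Count items matching a predicate from parallel workers, the same way
    // count_minimizers_and_kmers tallies minimizers and kmers per chunk.
    let total = AtomicUsize::new(0);
    (0..10_000u64).into_par_iter().for_each(|x| {
        if x % 7 == 0 {
            total.fetch_add(1, SeqCst);
        }
    });
    println!("multiples of 7: {}", total.load(SeqCst));
}

A call site for the new method itself reduces to: let (nb_minimizers, nb_kmers) = index.count_minimizers_and_kmers();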

pub fn par_write_kff<P: AsRef<Path>>(&self, path: P) -> Result<(), KFFError> {
// TODO discuss: our kmers are non unique and canonical, right ?
let header = kff::section::Header::new(1, 0, ENCODING, false, true, b"".to_vec())
