From 745cdb8db91316a3ae65b52e41677aaf6ae90cc0 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 9 Jul 2024 12:17:07 -0400 Subject: [PATCH] feat: better error message when barcode length differs from expected - barcode bases and lengths (actual and expected) and sample name are printed - fixes #31 by passing `Sample` to `count_mismatches()` instead of barcode string - all `count_mismatches()` tests updated to match new signature - minor miscellaneous comment typos fixed as well --- README.md | 2 +- src/bin/commands/demux.rs | 8 +- src/lib/barcode_matching.rs | 191 ++++++++++++++++++++++-------------- src/lib/samples.rs | 2 +- 4 files changed, 124 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 372c1a4..3974e18 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ bases, and molecular identifiers from each read. The observed sample barcode will be matched to the sample barcodes extracted from the bases in the sample metadata and associated read structures. -An observed barcode matches an expected barcocde if all the following are true: +An observed barcode matches an expected barcode if all the following are true: 1. The number of mismatches (edits/substitutions) is less than or equal to the maximum mismatches (see --max-mismatches). diff --git a/src/bin/commands/demux.rs b/src/bin/commands/demux.rs index e2bec29..62a2d68 100644 --- a/src/bin/commands/demux.rs +++ b/src/bin/commands/demux.rs @@ -54,7 +54,7 @@ struct FastqSegment { #[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)] enum SkipReason { - /// The read had to few bases for the segment. + /// The read had too few bases for the segment. TooFewBases, } @@ -526,7 +526,7 @@ impl DemuxMetric { /// molecular identifiers from each read. The observed sample barcode will be matched to the /// sample barcodes extracted from the bases in the sample metadata and associated read structures. /// -/// An observed barcode matches an expected barcocde if all the following are true: +/// An observed barcode matches an expected barcode if all the following are true: /// 1. The number of mismatches (edits/substitutions) is less than or equal to the maximum /// mismatches (see `--max-mismatches`). /// 2. The difference between number of mismatches in the best and second best barcodes is greater @@ -580,7 +580,7 @@ pub(crate) struct Demux { /// The read structure types to write to their own files (Must be one of T, B, or M for /// template reads, sample barcode reads, and molecular barcode reads). - /// + /// /// Multiple output types may be specified as a space-delimited list. #[clap(long, short='b', default_value="T", num_args = 1.. )] output_types: Vec, @@ -881,7 +881,7 @@ impl Command for Demux { // Setup the barcode matcher - the primary return from here is the index of the samp let mut barcode_matcher = BarcodeMatcher::new( - &sample_group.samples.iter().map(|s| s.barcode.as_str()).collect::>(), + &sample_group.samples, u8::try_from(self.max_mismatches)?, u8::try_from(self.min_mismatch_delta)?, true, diff --git a/src/lib/barcode_matching.rs b/src/lib/barcode_matching.rs index 4372c7c..c659dda 100644 --- a/src/lib/barcode_matching.rs +++ b/src/lib/barcode_matching.rs @@ -1,14 +1,14 @@ use super::byte_is_nocall; +use super::samples::Sample; use ahash::HashMap as AHashMap; use ahash::HashMapExt; -use bstr::{BString, ByteSlice}; const STARTING_CACHE_SIZE: usize = 1_000_000; /// The struct that contains the info related to the best and next best sample barcode match. #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub struct BarcodeMatch { - /// Index of the best bardcode match in the corresponding BarcodeMatcher struct that generated + /// Index of the best barcode match in the corresponding BarcodeMatcher struct that generated /// this match. pub best_match: usize, /// The number of mismatches to the best matching barcode for the read described by this match. @@ -21,10 +21,8 @@ pub struct BarcodeMatch { /// The struct responsible for matching barcodes to a ``Vec`` of sample barcodes. #[derive(Clone, Debug)] pub struct BarcodeMatcher { - /// Vec of the barcodes for each sample - /// Note - this is to be replaced by a sample struct in task 3. For now we're keeping things - /// very simple. - sample_barcodes: Vec, + /// Vec storing each sample + samples: Vec, /// The maximum mismatches to match a sample barcode. max_mismatches: u8, /// The minimum difference between number of mismatches in the best and second best barcodes @@ -37,31 +35,31 @@ pub struct BarcodeMatcher { } impl BarcodeMatcher { - /// Instantiates a new ``BarcodeMatcher`` struct. Checks that the sample barcodes vector is not + /// Instantiates a new ``BarcodeMatcher`` struct. Checks that the samples vector is not /// empty and that none of the barcodes provided are the empty string. /// /// # Panics - /// - Will panic if provided an empty vec of sample barcodes. + /// - Will panic if provided an empty vec of samples. /// - Will panic if any provided barcode is length zero. #[must_use] pub fn new( - sample_barcodes: &[&str], + samples: &[Sample], max_mismatches: u8, min_mismatch_delta: u8, use_cache: bool, ) -> Self { - assert!(!sample_barcodes.is_empty(), "Must provide at least one sample barcode"); + assert!(!samples.is_empty(), "Must provide at least one sample"); assert!( - sample_barcodes.iter().all(|b| !b.is_empty()), + samples.iter().all(|b| !b.barcode.is_empty()), "Sample barcode cannot be empty string" ); - let modified_sample_barcodes = sample_barcodes - .iter() - .map(|barcode| BString::from(barcode.to_ascii_uppercase())) - .collect::>(); + let mut modified_samples = samples.to_vec(); + for sample in modified_samples.iter_mut() { + sample.barcode = sample.barcode.to_ascii_uppercase(); + } Self { - sample_barcodes: modified_sample_barcodes, + samples: modified_samples, max_mismatches, min_mismatch_delta, use_cache, @@ -70,13 +68,19 @@ impl BarcodeMatcher { } /// Counts the number of bases that differ between two byte arrays. - fn count_mismatches(observed_bases: &[u8], expected_bases: &[u8]) -> u8 { + fn count_mismatches(observed_bases: &[u8], sample: &Sample) -> u8 { + let expected_bases = sample.barcode.as_bytes(); + let observed_string = + std::str::from_utf8(observed_bases).expect("Observed bases are not valid UTF-8"); assert_eq!( observed_bases.len(), expected_bases.len(), - "observed_bases: {}, expected_bases: {}", + "Read barcode ({}) length ({}) differs from expected barcode ({}) length ({}) for sample {}", + observed_string, observed_bases.len(), - expected_bases.len() + sample.barcode, + expected_bases.len(), + sample.sample_id ); let mut count: usize = 0; for (&expected_base, &observed_base) in expected_bases.iter().zip(observed_bases.iter()) { @@ -84,22 +88,22 @@ impl BarcodeMatcher { count += 1; } } - u8::try_from(count).expect("Overflow on number of mismatch bases") + u8::try_from(count).expect("Overflow on number of mismatched bases") } /// Returns the expected barcode length, assuming a fixed length for all samples. fn expected_barcode_length(&self) -> usize { - self.sample_barcodes[0].len() + self.samples[0].barcode.len() } /// Assigns the barcode that best matches the provided ``read_bases``. #[must_use] fn assign_internal(&self, read_bases: &[u8]) -> Option { - let mut best_barcode_index = self.sample_barcodes.len(); + let mut best_barcode_index = self.samples.len(); let mut best_mismatches = 255u8; let mut next_best_mismatches = 255u8; - for (index, sample_barcode) in self.sample_barcodes.iter().enumerate() { - let mismatches = Self::count_mismatches(read_bases, sample_barcode.as_bstr()); + for (index, sample) in self.samples.iter().enumerate() { + let mismatches = Self::count_mismatches(read_bases, sample); if mismatches < best_mismatches { next_best_mismatches = best_mismatches; @@ -155,77 +159,118 @@ mod tests { use super::*; use rstest::rstest; + /// Given a barcode and integer identifier, generate a sample instance. This helper function + /// allows creating more succinct testing code. + fn barcode_to_sample(barcode: &str, idx: usize) -> Sample { + Sample::new(idx, format!("sample_{idx}").to_string(), barcode.to_string()) + } + + /// Create a vector of samples from a list of barcodes, for more succinct testing code. + fn barcodes_to_samples(barcodes: &[&str]) -> Vec { + barcodes + .iter() + .enumerate() + .map(|(idx, barcode)| barcode_to_sample(barcode, idx)) + .collect::>() + } + // ############################################################################################ - // Test ``BarcodeMatcher`` instantiation panics. + // Test ``BarcodeMatcher`` instantiation // ############################################################################################ #[rstest] #[case(true)] #[case(false)] fn test_barcode_matcher_instantiation_can_succeed(#[case] use_cache: bool) { - let sample_barcodes = vec!["AGCT"]; - let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache); + let samples = barcodes_to_samples(&["ACGT"]); + let _matcher = BarcodeMatcher::new(&samples, 2, 1, use_cache); } #[rstest] #[case(true)] #[case(false)] - #[should_panic(expected = "Must provide at least one sample barcode")] - fn test_barcode_matcher_fails_if_no_sample_barcodes_provided(#[case] use_cache: bool) { - let sample_barcodes: Vec<&str> = vec![]; - let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache); - } - - #[rstest] - #[case(true)] - #[case(false)] - #[should_panic(expected = "Sample barcode cannot be empty string")] - fn test_barcode_matcher_fails_if_empty_sample_barcode_provided(#[case] use_cache: bool) { - let sample_barcodes = vec!["AGCT", ""]; - let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache); + #[should_panic(expected = "Must provide at least one sample")] + fn test_barcode_matcher_fails_if_no_samples_provided(#[case] use_cache: bool) { + let samples = barcodes_to_samples(&[]); + let _matcher = BarcodeMatcher::new(&samples, 2, 1, use_cache); } // ############################################################################################ // Test BarcodeMatcher::count_mismatches // ############################################################################################ - // Thought process behind not panicking on empty string is: - // 1. sample barcodes are checked to see if they are empty and sample vs read barcodes are - // compared for length, so empty string will fail anyway - // 2. the fewer operations / better optimization in this core matching function the better. + // NB: no need to test for empty barcode in sample (not allowed per 'samples.rs'), + // instead test that mismatch counting fails for empty read barcodes #[test] - fn empty_string_can_run_in_count_mismatches() { - assert_eq!(BarcodeMatcher::count_mismatches("".as_bytes(), "".as_bytes()), 0); + #[should_panic( + expected = "Read barcode () length (0) differs from expected barcode (CTATGT) length (6) for sample sample_0" + )] + fn empty_read_barcode_fails_length_mismatch() { + let _mismatches = + BarcodeMatcher::count_mismatches("".as_bytes(), &barcode_to_sample("CTATGT", 0)); } #[test] fn find_no_mismatches() { - assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GATTACA".as_bytes()), 0,); + assert_eq!( + BarcodeMatcher::count_mismatches( + "GATTACA".as_bytes(), + &barcode_to_sample("GATTACA", 0) + ), + 0 + ); } #[test] fn ns_in_expected_barcode_dont_contribute_to_mismatch_counter() { - assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,); + assert_eq!( + BarcodeMatcher::count_mismatches( + "GATTACA".as_bytes(), + &barcode_to_sample("GANNACA", 0) + ), + 0 + ); } #[test] fn find_two_mismatches() { - assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,); + assert_eq!( + BarcodeMatcher::count_mismatches( + "GATTACA".as_bytes(), + &barcode_to_sample("GACCACA", 0) + ), + 2 + ); } #[test] fn not_count_no_calls() { - assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,); + assert_eq!( + BarcodeMatcher::count_mismatches( + "GATTACA".as_bytes(), + &barcode_to_sample("GANNACA", 0) + ), + 0 + ); } #[test] fn find_compare_two_sequences_that_have_all_mismatches() { - assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "CTAATGT".as_bytes()), 7,); + assert_eq!( + BarcodeMatcher::count_mismatches( + "GATTACA".as_bytes(), + &barcode_to_sample("CTAATGT", 0) + ), + 7 + ); } #[test] - #[should_panic(expected = "observed_bases: 5, expected_bases: 6")] + #[should_panic( + expected = "Read barcode (GATTA) length (5) differs from expected barcode (CTATGT) length (6) for sample sample_0" + )] fn find_compare_two_sequences_of_different_length() { - let _mismatches = BarcodeMatcher::count_mismatches("GATTA".as_bytes(), "CTATGT".as_bytes()); + let _mismatches = + BarcodeMatcher::count_mismatches("GATTA".as_bytes(), &barcode_to_sample("CTATGT", 0)); } // ############################################################################################ @@ -237,10 +282,10 @@ mod tests { #[case(false)] fn test_assign_exact_match(#[case] use_cache: bool) { const EXPECTED_BARCODE_INDEX: usize = 0; - let sample_barcodes = vec!["ACGT", "AAAG", "CACA"]; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache); + let samples = barcodes_to_samples(&["ACGT", "AAAG", "CACA"]); + let mut matcher = BarcodeMatcher::new(&samples, 2, 2, use_cache); assert_eq!( - matcher.assign(sample_barcodes[EXPECTED_BARCODE_INDEX].as_bytes()), + matcher.assign(samples[EXPECTED_BARCODE_INDEX].barcode.as_bytes()), Some(BarcodeMatch { best_match: EXPECTED_BARCODE_INDEX, best_mismatches: 0, @@ -253,8 +298,8 @@ mod tests { #[case(true)] #[case(false)] fn test_assign_imprecise_match(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAAT", "AGAG", "CACA"]; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache); + let samples = barcodes_to_samples(&["AAAT", "AGAG", "CACA"]); + let mut matcher = BarcodeMatcher::new(&samples, 2, 2, use_cache); // 1 different base // | // v @@ -267,8 +312,8 @@ mod tests { #[case(true)] #[case(false)] fn test_assign_precise_match_with_no_call(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAAT", "AGAG", "CACA"]; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache); + let samples = barcodes_to_samples(&["AAAT", "AGAG", "CACA"]); + let mut matcher = BarcodeMatcher::new(&samples, 2, 2, use_cache); // 1 no-call // | // v @@ -281,8 +326,8 @@ mod tests { #[case(true)] #[case(false)] fn test_assign_imprecise_match_with_no_call(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAATTT", "AGAGGG", "CACAGG"]; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache); + let samples = barcodes_to_samples(&["AAATTT", "AGAGGG", "CACAGG"]); + let mut matcher = BarcodeMatcher::new(&samples, 2, 2, use_cache); // 1 no-call // | // | 1 different base @@ -297,8 +342,8 @@ mod tests { #[case(true)] #[case(false)] fn test_sample_no_call_doesnt_contribute_to_mismatch_number(#[case] use_cache: bool) { - let sample_barcodes = vec!["NAGTTT", "AGAGGG", "CACAGG"]; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 1, 2, use_cache); + let samples = barcodes_to_samples(&["NAGTTT", "AGAGGG", "CACAGG"]); + let mut matcher = BarcodeMatcher::new(&samples, 1, 2, use_cache); // 1 no-call // | // | 1 different base @@ -314,8 +359,8 @@ mod tests { #[case(true)] #[case(false)] fn test_read_no_call_contributes_to_mismatch_number(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAATTT", "AGAGGG", "CACAGG"]; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 1, 2, use_cache); + let samples = barcodes_to_samples(&["AAATTT", "AGAGGG", "CACAGG"]); + let mut matcher = BarcodeMatcher::new(&samples, 1, 2, use_cache); // 1 no-call // | // | 1 different base @@ -329,9 +374,9 @@ mod tests { #[case(true)] #[case(false)] fn test_produce_no_match_if_too_many_mismatches(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAGCTAG", "CAGCTAG", "GAGCTAG", "TAGCTAG"]; + let samples = barcodes_to_samples(&["AAGCTAG", "CAGCTAG", "GAGCTAG", "TAGCTAG"]); let assignment_barcode: &[u8] = b"ATCGATC"; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 0, 100, use_cache); + let mut matcher = BarcodeMatcher::new(&samples, 0, 100, use_cache); assert_eq!(matcher.assign(assignment_barcode), None); } @@ -339,9 +384,9 @@ mod tests { #[case(true)] #[case(false)] fn test_produce_no_match_if_within_mismatch_delta(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"]; - let assignment_barcode: &[u8] = sample_barcodes[3].as_bytes(); - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 100, 3, use_cache); + let samples = barcodes_to_samples(&["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"]); + let assignment_barcode: &[u8] = samples[3].barcode.as_bytes(); + let mut matcher = BarcodeMatcher::new(&samples, 100, 3, use_cache); assert_eq!(matcher.assign(assignment_barcode), None); } @@ -349,9 +394,9 @@ mod tests { #[case(true)] #[case(false)] fn test_produce_no_match_if_too_many_mismatches_via_nocalls(#[case] use_cache: bool) { - let sample_barcodes = vec!["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"]; + let samples = barcodes_to_samples(&["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"]); let assignment_barcode: &[u8] = b"GGGGGGTN"; - let mut matcher = BarcodeMatcher::new(&sample_barcodes, 0, 100, use_cache); + let mut matcher = BarcodeMatcher::new(&samples, 0, 100, use_cache); assert_eq!(matcher.assign(assignment_barcode), None); } } diff --git a/src/lib/samples.rs b/src/lib/samples.rs index 0fc6f8f..766f835 100644 --- a/src/lib/samples.rs +++ b/src/lib/samples.rs @@ -90,7 +90,7 @@ impl Display for SampleGroup { impl SampleGroup { /// Validates a group of [`Sample`]s and instantiates a [`Self`] struct if they are /// valid. Will clone the [`Sample`] structs and change the number on the `ordinal` field on - /// those cloneto match the order in which they are stored in this [`Self`] + /// those cloned to match the order in which they are stored in this [`Self`] /// # Panics /// - Will panic if sample metadata sheet is improperly formatted /// - Will panic if there are duplicate sample names provided