Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix early return and counting of mismatches with no-calls in barcodes. #48

Merged
merged 2 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions src/bin/commands/demux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1371,6 +1371,123 @@ mod tests {
);
}

#[test]
fn test_demux_with_catchall_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNNNNNN";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];

let output_dir = tmp.path().to_path_buf().join("output");

let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();

let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 0);

let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);

assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:NNNNNNN".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}

#[test]
fn test_demux_with_ns_in_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNAAAAA";
let s2_barcode = "NNCCCCC";
let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[
&("ANAAAAA".to_owned() + &"A".repeat(5)),
&("ANCCCCC".to_owned() + &"C".repeat(5)),
&("NNNAAAA".to_owned() + &"T".repeat(5)),
],
)];

let output_dir = tmp.path().to_path_buf().join("output");

let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 0,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();

let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:ANAAAAA".to_vec(),
seq: "A".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);

let output_path = output_dir.join("Sample0001.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_1 1:N:0:ANCCCCC".to_vec(),
seq: "C".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);

// Should not match since it has 3 no calls, and barcodes have at maximum 1 no-call
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 1);
assert_equal(
&unmatched_reads[0],
&OwnedRecord {
head: b"ex_2 1:N:0:NNNAAAA".to_vec(),
seq: "T".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
}

#[test]
fn test_demux_paired_reads_with_in_line_sample_barcodes() {
let tmp = TempDir::new().unwrap();
Expand Down
19 changes: 17 additions & 2 deletions src/lib/barcode_matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ pub struct BarcodeMatcher {
/// Note - this is to be replaced by a sample struct in task 3. For now we're keeping things
/// very simple.
sample_barcodes: Vec<BString>,
/// The maxium number of Ns in any barcode in set of sample barcodes
max_ns_in_barcodes: usize,
/// The maximum mismatches to match a sample barcode.
max_mismatches: u8,
/// The minimum difference between number of mismatches in the best and second best barcodes
Expand Down Expand Up @@ -56,12 +58,20 @@ impl BarcodeMatcher {
"Sample barcode cannot be empty string"
);

let mut max_ns_in_barcodes = 0;
let modified_sample_barcodes = sample_barcodes
.iter()
.map(|barcode| BString::from(barcode.to_ascii_uppercase()))
.map(|barcode| {
let barcode = BString::from(barcode.to_ascii_uppercase());
let num_ns = barcode.iter().filter(|&b| byte_is_nocall(*b)).count();
max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns);
barcode
})
.collect::<Vec<_>>();

Self {
sample_barcodes: modified_sample_barcodes,
max_ns_in_barcodes,
max_mismatches,
min_mismatch_delta,
use_cache,
Expand Down Expand Up @@ -132,7 +142,7 @@ impl BarcodeMatcher {
return None;
}
let num_no_calls = read_bases.iter().filter(|&&b| byte_is_nocall(b)).count();
if num_no_calls > self.max_mismatches as usize {
if num_no_calls > (self.max_mismatches as usize) + self.max_ns_in_barcodes {
None
} else if self.use_cache {
if let Some(cached_match) = self.cache.get(read_bases) {
Expand Down Expand Up @@ -207,6 +217,11 @@ mod tests {
assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
}

#[test]
fn all_ns_barcode_have_no_mismatches() {
assert_eq!(BarcodeMatcher::count_mismatches("GANNACA".as_bytes(), "NNNNNNN".as_bytes()), 0,);
}

#[test]
fn find_two_mismatches() {
assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,);
Expand Down
Loading