Skip to content

Commit

Permalink
add back SA search tests (#22)
Browse files Browse the repository at this point in the history
  • Loading branch information
BramDevlaminck authored May 22, 2024
1 parent 87aa164 commit bf4dba0
Show file tree
Hide file tree
Showing 4 changed files with 367 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions sa-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dev-dependencies]
tempdir = "0.3.7"

[dependencies]
clap = { version = "4.4.8", features = ["derive"] }
umgap = "1.1.0"
Expand Down
362 changes: 362 additions & 0 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -546,3 +546,365 @@ impl Searcher {
.get_all_functional_annotations(proteins)
}
}

#[cfg(test)]
mod tests {
use sa_mappings::functionality::FunctionAggregator;
use sa_mappings::proteins::{Protein, Proteins};
use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator};
use crate::sa_searcher::{
BoundSearchResult, SearchAllSuffixesResult, Searcher,
};
use crate::suffix_to_protein_index::SparseSuffixToProtein;
use tempdir::TempDir;
use std::{
fs::File,
io::Write,
path::PathBuf
};

// Writes a small fixed taxonomy table to `taxonomy.tsv` inside `tmp_dir` and
// returns the path of the created file.
//
// Every row is tab-separated and terminated by a newline. The columns look like
// id, name, rank, parent id and a trailing `\x01` field — TODO confirm the exact
// meaning of each column against the taxonomy parser.
//
// Panics if the file cannot be created or written.
fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf {
    const TAXONOMY_ROWS: [&str; 14] = [
        "1\troot\tno rank\t1\t\x01",
        "2\tBacteria\tsuperkingdom\t1\t\x01",
        "6\tAzorhizobium\tgenus\t1\t\x01",
        "7\tAzorhizobium caulinodans\tspecies\t6\t\x01",
        "9\tBuchnera aphidicola\tspecies\t6\t\x01",
        "10\tCellvibrio\tgenus\t6\t\x01",
        "11\tCellulomonas gilvus\tspecies\t10\t\x01",
        "13\tDictyoglomus\tgenus\t11\t\x01",
        "14\tDictyoglomus thermophilum\tspecies\t10\t\x01",
        "16\tMethylophilus\tgenus\t14\t\x01",
        "17\tMethylophilus methylotrophus\tspecies\t16\t\x01",
        "18\tPelobacter\tgenus\t17\t\x01",
        "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01",
        "20\tPhenylobacterium\tgenus\t19\t\x01",
    ];

    let path = tmp_dir.path().join("taxonomy.tsv");
    let mut out = File::create(&path).unwrap();

    // One row per line, in the order listed above.
    for row in TAXONOMY_ROWS.iter() {
        writeln!(out, "{}", row).unwrap();
    }

    path
}


// Builds the `Proteins` fixture shared by several tests: the text
// "AI-BLACVAA-AC-KCRLZ$" together with four placeholder protein records
// (empty id, taxon 0, no annotations). The text consists of four '-'-separated
// segments, presumably one per protein record.
fn get_example_proteins() -> Proteins {
    let placeholder = || Protein {
        uniprot_id: String::new(),
        taxon_id: 0,
        functional_annotations: vec![],
    };

    Proteins {
        input_string: b"AI-BLACVAA-AC-KCRLZ$".to_vec(),
        proteins: std::iter::repeat_with(placeholder).take(4).collect(),
    }
}

// Bounds search on the example text with a suffix array that lists all 20
// text positions (second constructor argument is 1).
#[test]
fn test_search_simple() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = get_example_proteins();
    let suffix_array = vec![
        19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
    ];
    let searcher = Searcher::new(
        suffix_array,
        1,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // bounds for the single character 'A'
    assert_eq!(
        searcher.search_bounds(b"A"),
        BoundSearchResult::SearchResult((4, 9))
    );

    // bounds for the end-of-text character '$'
    assert_eq!(
        searcher.search_bounds(b"$"),
        BoundSearchResult::SearchResult((0, 1))
    );

    // bounds for the two-character pattern 'AC'
    assert_eq!(
        searcher.search_bounds(b"AC"),
        BoundSearchResult::SearchResult((6, 8))
    );
}

// Suffix search on the example text with a sparse suffix array: only the
// positions that are multiples of 3 are stored, and 3 is passed as the second
// constructor argument.
#[test]
fn test_search_sparse() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = get_example_proteins();
    let sparse_sa = vec![9, 0, 3, 12, 15, 6, 18];
    let searcher = Searcher::new(
        sparse_sa,
        3,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // 'VAA' occurs once in the text, at position 7
    assert_eq!(
        searcher.search_matching_suffixes(b"VAA", usize::MAX, false),
        SearchAllSuffixesResult::SearchResult(vec![7])
    );

    // 'AC' occurs twice, at positions 5 and 11
    assert_eq!(
        searcher.search_matching_suffixes(b"AC", usize::MAX, false),
        SearchAllSuffixesResult::SearchResult(vec![5, 11])
    );
}

// Bounds search where the pattern contains an 'I' while the text contains 'L'
// at the corresponding place, using the suffix array over all text positions.
#[test]
fn test_il_equality() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = get_example_proteins();
    let suffix_array = vec![
        19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
    ];
    let searcher = Searcher::new(
        suffix_array,
        1,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // bounds for the single character 'I'
    assert_eq!(
        searcher.search_bounds(b"I"),
        BoundSearchResult::SearchResult((13, 16))
    );

    // bounds for 'RIZ', which only matches 'RLZ' in the text when I and L are
    // treated as equal
    assert_eq!(
        searcher.search_bounds(b"RIZ"),
        BoundSearchResult::SearchResult((17, 18))
    );
}

// Suffix search for 'RIZ' on the sparse suffix array of the example text, once
// with and once without treating I and L as equal (the last argument of
// `search_matching_suffixes`).
#[test]
fn test_il_equality_sparse() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = get_example_proteins();
    let sparse_sa = vec![9, 0, 3, 12, 15, 6, 18];
    let searcher = Searcher::new(
        sparse_sa,
        3,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // search suffixes matching 'RIZ' with equal I and L: 'RLZ' at position 16 matches
    assert_eq!(
        searcher.search_matching_suffixes(b"RIZ", usize::MAX, true),
        SearchAllSuffixesResult::SearchResult(vec![16])
    );

    // search suffixes matching 'RIZ' without equal I and L: nothing matches
    assert_eq!(
        searcher.search_matching_suffixes(b"RIZ", usize::MAX, false),
        SearchAllSuffixesResult::NoMatches
    );
}

// Edge case: the text starts with an 'L', so an I/L character sits at the very
// first entry of the sparse suffix array.
#[test]
fn test_l_first_index_in_sa() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = Proteins {
        input_string: b"LMOXZ$".to_vec(),
        proteins: vec![Protein {
            uniprot_id: String::new(),
            taxon_id: 0,
            functional_annotations: Vec::new(),
        }],
    };

    // only the even text positions are stored
    let sparse_sa = vec![0, 2, 4];
    let searcher = Searcher::new(
        sparse_sa,
        2,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // search suffixes matching 'IM' with equal I and L: 'LM' at position 0 matches
    assert_eq!(
        searcher.search_matching_suffixes(b"IM", usize::MAX, true),
        SearchAllSuffixesResult::SearchResult(vec![0])
    );
}

// Searching a single 'I' with I/L equality on "AAILLL$" must report the 'I'
// and every 'L' position, i.e. no match may be missing.
#[test]
fn test_il_missing_matches() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = Proteins {
        input_string: b"AAILLL$".to_vec(),
        proteins: vec![Protein {
            uniprot_id: String::new(),
            taxon_id: 0,
            functional_annotations: Vec::new(),
        }],
    };

    // all 7 text positions are present in this suffix array
    let suffix_array = vec![6, 0, 1, 5, 4, 3, 2];
    let searcher = Searcher::new(
        suffix_array,
        1,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // search suffixes matching 'I' with equal I and L
    assert_eq!(
        searcher.search_matching_suffixes(b"I", usize::MAX, true),
        SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])
    );
}

// Searching 'II' with I/L equality on "IIIILL$" must report each matching
// start position exactly once.
#[test]
fn test_il_duplication() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = Proteins {
        input_string: b"IIIILL$".to_vec(),
        proteins: vec![Protein {
            uniprot_id: String::new(),
            taxon_id: 0,
            functional_annotations: Vec::new(),
        }],
    };

    // all 7 text positions are present in this suffix array
    let suffix_array = vec![6, 5, 4, 3, 2, 1, 0];
    let searcher = Searcher::new(
        suffix_array,
        1,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // search suffixes matching 'II' with equal I and L
    assert_eq!(
        searcher.search_matching_suffixes(b"II", usize::MAX, true),
        SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])
    );
}

// Searching 'II' without I/L equality on "IIIILL$" through a sparse suffix
// array; verifies that candidate suffixes are filtered correctly.
#[test]
fn test_il_suffix_check() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = Proteins {
        input_string: b"IIIILL$".to_vec(),
        proteins: vec![Protein {
            uniprot_id: String::new(),
            taxon_id: 0,
            functional_annotations: Vec::new(),
        }],
    };

    // only the even text positions are stored
    let sparse_sa = vec![6, 4, 2, 0];
    let searcher = Searcher::new(
        sparse_sa,
        2,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // find every place where 'II' occurs in 'IIIILL' using the sparse SA;
    // this checks that filtering the suffixes works as expected
    assert_eq!(
        searcher.search_matching_suffixes(b"II", usize::MAX, false),
        SearchAllSuffixesResult::SearchResult(vec![0, 1, 2])
    );
}

// Searching 'II' with I/L equality on "IILLLL$" must report each matching
// start position exactly once.
#[test]
fn test_il_duplication2() {
    let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
    let taxonomy_path = create_taxonomy_file(&tmp_dir);
    let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(
        taxonomy_path.to_str().unwrap(),
        AggregationMethod::LcaStar,
    )
    .unwrap();

    let proteins = Proteins {
        input_string: b"IILLLL$".to_vec(),
        proteins: vec![Protein {
            uniprot_id: String::new(),
            taxon_id: 0,
            functional_annotations: Vec::new(),
        }],
    };

    // all 7 text positions are present in this suffix array
    let suffix_array = vec![6, 5, 4, 3, 2, 1, 0];
    let searcher = Searcher::new(
        suffix_array,
        1,
        Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
        proteins,
        taxon_aggregator,
        FunctionAggregator {},
    );

    // search suffixes matching 'II' with equal I and L
    assert_eq!(
        searcher.search_matching_suffixes(b"II", usize::MAX, true),
        SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])
    );
}
}
2 changes: 1 addition & 1 deletion sa-mappings/src/proteins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub struct Proteins {
pub input_string: Vec<u8>,

/// The proteins in the input string
proteins: Vec<Protein>
pub proteins: Vec<Protein>
}

impl Protein {
Expand Down

0 comments on commit bf4dba0

Please sign in to comment.