Skip to content

Commit

Permalink
add functionality to search only tryptic matches
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonVandeVyver committed Sep 13, 2024
1 parent f15e3f8 commit 9e89156
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 34 deletions.
22 changes: 16 additions & 6 deletions sa-index/src/peptide_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ impl From<&Protein> for ProteinInfo {
/// * `equate_il` - Boolean indicating if we want to equate I and L during search
/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
/// taxonomy
/// * `tryptic` - Boolean indicating if we only want tryptic matches.
///
/// # Returns
///
Expand All @@ -50,7 +51,8 @@ pub fn search_proteins_for_peptide<'a>(
searcher: &'a Searcher,
peptide: &str,
cutoff: usize,
equate_il: bool
equate_il: bool,
tryptic: bool
) -> Option<(bool, Vec<&'a Protein>)> {
let peptide = peptide.trim_end().to_uppercase();

Expand All @@ -59,7 +61,7 @@ pub fn search_proteins_for_peptide<'a>(
return None;
}

let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il);
let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il, tryptic);
let (suffixes, cutoff_used) = match suffix_search {
SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)),
SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)),
Expand All @@ -71,8 +73,14 @@ pub fn search_proteins_for_peptide<'a>(
Some((cutoff_used, proteins))
}

pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool) -> Option<SearchResult> {
let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il)?;
pub fn search_peptide(
searcher: &Searcher,
peptide: &str,
cutoff: usize,
equate_il: bool,
tryptic: bool
) -> Option<SearchResult> {
let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il, tryptic)?;

Some(SearchResult {
sequence: peptide.to_string(),
Expand All @@ -91,6 +99,7 @@ pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_
/// * `equate_il` - Boolean indicating if we want to equate I and L during search
/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
/// taxonomy
/// * `tryptic` - Boolean indicating if we only want tryptic matches.
///
/// # Returns
///
Expand All @@ -99,11 +108,12 @@ pub fn search_all_peptides(
searcher: &Searcher,
peptides: &Vec<String>,
cutoff: usize,
equate_il: bool
equate_il: bool,
tryptic: bool
) -> Vec<SearchResult> {
peptides
.par_iter()
.filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il))
.filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il, tryptic))
.collect()
}

Expand Down
104 changes: 78 additions & 26 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{cmp::min, ops::Deref};

use sa_mappings::proteins::{Protein, Proteins};
use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER};

use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
Expand Down Expand Up @@ -297,6 +297,7 @@ impl Searcher {
/// * `max_matches` - The maximum amount of matches processed, if more matches are found we
/// don't process them
/// * `equate_il` - True if we want to equate I and L during search, otherwise false
/// * `tryptic` - Boolean indicating if we only want tryptic matches.
///
/// # Returns
///
Expand All @@ -306,7 +307,8 @@ impl Searcher {
&self,
search_string: &[u8],
max_matches: usize,
equate_il: bool
equate_il: bool,
tryptic: bool
) -> SearchAllSuffixesResult {
let mut matching_suffixes: Vec<i64> = vec![];
let mut il_locations = vec![];
Expand Down Expand Up @@ -334,32 +336,41 @@ impl Searcher {
let mut sa_index = min_bound;
while sa_index < max_bound {
let suffix = self.sa.get(sa_index) as usize;
// filter away matches where I was wrongfully equalized to L, and check the
// unmatched prefix when I and L equalized, we only need to
// check the prefix, not the whole match, when the prefix is 0, we don't need to
// check at all
if suffix >= skip
&& ((skip == 0

if suffix >= skip {
let match_start = suffix - skip;
let match_end = suffix + search_string.len() - skip;

// filter away matches where I was wrongfully equalized to L, and check the
// unmatched prefix when I and L equalized, we only need to
// check the prefix, not the whole match, when the prefix is 0, we don't need to
// check at all
if (skip == 0
|| Self::check_prefix(
current_search_string_prefix,
&self.proteins.input_string[suffix - skip..suffix],
&self.proteins.input_string[match_start..suffix],
equate_il
))
&& Self::check_suffix(
skip,
il_locations_current_suffix,
current_search_string_suffix,
&self.proteins.input_string[suffix..suffix + search_string.len() - skip],
&self.proteins.input_string[suffix..match_end],
equate_il
))
{
matching_suffixes.push((suffix - skip) as i64);

// return if max number of matches is reached
if matching_suffixes.len() >= max_matches {
return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
)
&& (!tryptic
|| ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
&& (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
{
matching_suffixes.push((suffix - skip) as i64);

// return if max number of matches is reached
if matching_suffixes.len() >= max_matches {
return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
}
}
}

sa_index += 1;
}
}
Expand All @@ -373,6 +384,47 @@ impl Searcher {
}
}

/// Tells whether a cut position coincides with the first residue of a protein.
///
/// A cut is considered a protein start when it sits at the very beginning of the
/// protein text, or when the byte directly before it is `SEPARATION_CHARACTER`.
///
/// # Arguments
/// * `cut_index` - The index of the cut in the text of proteins.
///
/// # Returns
///
/// Returns true if the cut is at the start of a protein.
#[inline]
fn check_start_of_protein(&self, cut_index: usize) -> bool {
    // Index 0 has no preceding byte to inspect; it is a protein start by definition.
    if cut_index == 0 {
        return true;
    }

    let preceding_index = cut_index - 1;
    self.proteins.input_string[preceding_index] == SEPARATION_CHARACTER
}

/// Tells whether a cut position coincides with the end of a protein.
///
/// The byte at `cut_index` is inspected: the cut is a protein end when that byte is
/// either `TERMINATION_CHARACTER` or `SEPARATION_CHARACTER`.
///
/// # Arguments
/// * `cut_index` - The index of the cut in the text of proteins.
///
/// # Returns
///
/// Returns true if the cut is at the end of a protein.
#[inline]
fn check_end_of_protein(&self, cut_index: usize) -> bool {
    let boundary = self.proteins.input_string[cut_index];
    boundary == TERMINATION_CHARACTER || boundary == SEPARATION_CHARACTER
}

/// Check if a cut is a tryptic cut: the amino acid preceding the cut is K or R
/// and the amino acid at the cut is not P.
///
/// A cut at index 0 has no preceding amino acid, so it is never reported as a
/// tryptic cut. Callers that want to accept the start of the text should combine
/// this check with a start-of-protein check.
///
/// # Arguments
/// * `cut_index` - The index of the cut in the text of proteins. It is used to
///   index the protein text directly, so it must be a valid index into that text.
///
/// # Returns
///
/// Returns true if the cut is a tryptic cut.
#[inline]
fn check_tryptic_cut(&self, cut_index: usize) -> bool {
    // Guard first: without it `cut_index - 1` underflows for a cut at the very
    // start of the text, making this method safe only when the caller happens to
    // short-circuit on the start-of-protein check beforehand.
    if cut_index == 0 {
        return false;
    }

    let text = &self.proteins.input_string;
    let preceding = text[cut_index - 1];

    // Trypsin cleaves after K or R, except when the next residue is P.
    (preceding == b'K' || preceding == b'R') && text[cut_index] != b'P'
}

/// Returns true if the prefixes are the same
/// if `equate_il` is set to true, L and I are considered the same
///
Expand Down Expand Up @@ -545,11 +597,11 @@ mod tests {
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));

// search suffix 'VAA'
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7]));

// search suffix 'AC'
let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11]));
}

Expand Down Expand Up @@ -578,11 +630,11 @@ mod tests {
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));

// search bounds 'RIZ' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16]));

// search bounds 'RIZ' without equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
}

Expand All @@ -605,7 +657,7 @@ mod tests {
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0]));
}

Expand All @@ -626,7 +678,7 @@ mod tests {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]));
}

Expand All @@ -647,7 +699,7 @@ mod tests {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
}

Expand All @@ -670,7 +722,7 @@ mod tests {

// search all places where II is in the string IIIILL, but with a sparse SA
// this way we check if filtering the suffixes works as expected
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2]));
}

Expand All @@ -692,7 +744,7 @@ mod tests {
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

// search bounds 'II' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
}
}
6 changes: 4 additions & 2 deletions sa-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ struct InputData {
cutoff: usize,
#[serde(default = "bool::default")]
// default value is false // TODO: maybe default should be true?
equate_il: bool
equate_il: bool,
#[serde(default = "bool::default")] // default false
tryptic: bool
}

#[tokio::main]
Expand All @@ -83,7 +85,7 @@ async fn search(
State(searcher): State<Arc<SparseSearcher>>,
data: Json<InputData>
) -> Result<Json<Vec<SearchResult>>, StatusCode> {
let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il);
let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il, data.tryptic);

Ok(Json(search_result))
}
Expand Down

0 comments on commit 9e89156

Please sign in to comment.