From 26bca34353f8cfc6e74d6433d58eff532a94d001 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 23 Aug 2024 11:21:39 +0200 Subject: [PATCH] improve checks --- sa-index/src/sa_searcher.rs | 70 +++++++++++++++++++------------------ sa-mappings/src/proteins.rs | 2 ++ 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index f23c7e9..34be8c0 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,5 +1,5 @@ use std::{cmp::min, ops::Deref}; -use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER}; +use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER}; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, @@ -341,40 +341,41 @@ impl Searcher { while sa_index < max_bound { let suffix = self.sa.get(sa_index) as usize; - // filter away matches where I was wrongfully equalized to L, and check the - // unmatched prefix when I and L equalized, we only need to - // check the prefix, not the whole match, when the prefix is 0, we don't need to - // check at all - if suffix >= skip - && ((skip == 0 - || Self::check_prefix( - current_search_string_prefix, - &self.proteins.input_string[suffix - skip..suffix], - equate_il - )) - && Self::check_suffix( - skip, - il_locations_current_suffix, - current_search_string_suffix, - &self.proteins.input_string[suffix..suffix + search_string.len() - skip], - equate_il - )) - { - if tryptic && ( - (suffix - skip == 0) - || self.proteins.input_string[suffix - skip - 1] == b'R' - || self.proteins.input_string[suffix - skip - 1] == b'K' - || self.proteins.input_string[suffix - skip - 1] == SEPARATION_CHARACTER - ) && ( - self.proteins.input_string[suffix - skip + search_string.len() + 1] != b'P' - || self.proteins.input_string[suffix - skip + search_string.len() + 1] == SEPARATION_CHARACTER - || self.proteins.input_string[suffix - skip + search_string.len() + 1] == TERMINATION_CHARACTER - ) - { - sa_index += 1; - continue; - } + // If the suffix is smaller than the skip factor, we can't check the prefix + let checkable_suffix = suffix >= skip; + + // Check for trypticity if a tryptic search is performed + let is_tryptic = tryptic && ( + self.proteins.input_string[suffix - skip - 1] == b'R' || + self.proteins.input_string[suffix - skip - 1] == b'K' || + self.proteins.input_string[suffix - skip - 1] == SEPARATION_CHARACTER + ) && self.proteins.input_string[suffix - skip + search_string.len()] != b'P'; + if !is_tryptic { + sa_index += 1; + continue; + } + + // If the skip factor is 0, the entire search string should match. + let completely_matched = checkable_suffix && skip == 0; + + // If the skip factor is not 0, the prefix should match the prefix of the search string. + let prefix_matched = completely_matched || Self::check_prefix( + current_search_string_prefix, + &self.proteins.input_string[suffix - skip..suffix], + equate_il + ); + + // If the prefix is matched, we can check the suffix. + let suffix_matched = prefix_matched && Self::check_suffix( + skip, + il_locations_current_suffix, + current_search_string_suffix, + &self.proteins.input_string[suffix..suffix + search_string.len() - skip], + equate_il + ); + + if suffix_matched { matching_suffixes.push((suffix - skip) as i64); // return if max number of matches is reached @@ -382,6 +383,7 @@ impl Searcher { return SearchAllSuffixesResult::MaxMatches(matching_suffixes); } } + sa_index += 1; } } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index f2b24cc..a9d4071 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -65,6 +65,8 @@ impl Proteins { // because of the encoded functional annotations let mut lines = ByteLines::new(BufReader::new(file)); + input_string.push(SEPARATION_CHARACTER.into()); + while let Some(Ok(line)) = lines.next() { let mut fields = line.split(|b| *b == b'\t');