Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimise initial bounds for maching peptides in Suffix Array #25

Draft
wants to merge 26 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b95b9f9
first bad implementation for a kmer lookup cache
tibvdm Aug 26, 2024
4b522b7
Fix indexing issues
tibvdm Aug 26, 2024
fc8551e
fix empty string and add TODO for IL equality
tibvdm Aug 26, 2024
1047e43
Set K to 5 for testing purposes
tibvdm Aug 26, 2024
252d9c8
Don't search for peptides where the k-mer has no matches
tibvdm Aug 26, 2024
6d8c54e
Try to get more hits
tibvdm Aug 26, 2024
f7c66a2
Do more lookups, but try to avoid the largests bounds this way
tibvdm Aug 26, 2024
6aab11b
Don't search te entire search string again
tibvdm Aug 26, 2024
32c8a83
Move while loop to the comment
tibvdm Aug 26, 2024
deccd8a
remove shady lcp hack
tibvdm Aug 26, 2024
53c9490
Optimize the index functions
tibvdm Aug 26, 2024
54c5ef4
Do not store parameter K as a constant, but a field.
tibvdm Aug 27, 2024
cdb434d
offset array for efficiency + renaming bounds_cache file
tibvdm Aug 27, 2024
031a609
set L to I
tibvdm Aug 27, 2024
b6ff10d
fix tests
tibvdm Aug 27, 2024
a26ce06
add print information to track filling the cache
tibvdm Aug 27, 2024
2d0b2a4
add timings to track filling the cache
tibvdm Aug 27, 2024
ed2d746
min bound
tibvdm Aug 27, 2024
bfc358f
add `-` to the alphabet
tibvdm Aug 28, 2024
428c63b
remove equate
tibvdm Aug 28, 2024
3db59a0
division by 0 fix
tibvdm Aug 28, 2024
a1c3cdd
division by 0 fix
tibvdm Aug 28, 2024
c3000ff
division by 0 fix
tibvdm Aug 28, 2024
99cc2a5
fix the amount of iterations to calculate the k-mers
tibvdm Aug 28, 2024
e87e0cf
debug bounds
tibvdm Aug 28, 2024
352bcb7
fix the print code
tibvdm Aug 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions sa-index/src/bounds_cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
pub struct BoundsCache {
pub bounds: Vec<Option<(usize, usize)>>,
pub base: usize,
pub k: usize,

ascii_array: [usize; 128],
powers_array: [usize; 10],
offsets_array: [usize; 10],
alphabet: Vec<u8>
}

impl BoundsCache {
pub fn new(alphabet: String, k: usize) -> BoundsCache {
assert!(k < 10, "K must be less than 10");

let alphabet = alphabet.to_uppercase().as_bytes().to_vec();
let base = alphabet.len();

let mut ascii_array: [usize; 128] = [0; 128];
for (i, byte) in alphabet.iter().enumerate() {
ascii_array[*byte as usize] = i;
}
//ascii_array[b'L' as usize] = ascii_array[b'I' as usize]; // I and L are treated as the same amino acid

let mut powers_array = [0; 10];
for i in 0..10 {
powers_array[i] = base.pow(i as u32);
}

let mut offsets_array = [0; 10];
for i in 2..10 {
offsets_array[i] = offsets_array[i - 1] + powers_array[i - 1];
}

// 20^1 + 20^2 + 20^3 + ... + 20^(K) = (20^(K + 1) - 20) / 19
let capacity = (base.pow(k as u32 + 1) - base) / (base - 1);

Self {
bounds: vec![None; capacity],
ascii_array,
powers_array,
offsets_array,
alphabet,
base,
k
}
}

pub fn get_kmer(&self, kmer: &[u8]) -> Option<(usize, usize)> {
self.bounds.get(self.kmer_to_index(kmer)).cloned()?
}

pub fn update_kmer(&mut self, kmer: &[u8], bounds: (usize, usize)) {
let index = self.kmer_to_index(kmer);
self.bounds[index] = Some(bounds);
}

pub fn index_to_kmer(&self, mut index: usize) -> Vec<u8> {
if index < self.base {
return vec![self.alphabet[index]];
}

// Calculate the length of the kmer
let mut length = 2;
while self.offsets_array[length + 1] <= index {
length += 1;
}

// Calculate the offset of the kmer
let offset = self.offsets_array[length];

// Translate the index to be inside the bounds [0, 20^k)
index -= offset;

// Basic conversion from base 10 to base `length`
let mut kmer = vec![0; length];
for i in 0..length {
kmer[length - i - 1] = self.alphabet[index % self.base];
index /= self.base;
}

kmer
}

fn kmer_to_index(&self, kmer: &[u8]) -> usize {
if kmer.len() == 1 {
return self.ascii_array[kmer[0] as usize];
}

let mut result = 0;
for i in 0..kmer.len() {
result += (self.ascii_array[kmer[i] as usize] + 1) * self.powers_array[kmer.len() - i - 1];
}

result - 1
}
}

#[cfg(test)]
mod tests {
use std::str::from_utf8;
use super::*;

#[test]
fn test_bounds_cache() {
let kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string(), 5);

for i in 0..40 {
let kmer = kmer_cache.index_to_kmer(i);
eprintln!("{} -> {:?} -> {:?}", i, from_utf8(&kmer).unwrap(), kmer_cache.kmer_to_index(&kmer));
}

for i in 0..20_usize.pow(5) {
let kmer = kmer_cache.index_to_kmer(i);

if kmer.contains(&b'L') {
let equated_kmer = kmer.iter().map(|&c| if c == b'L' { b'I' } else { c }).collect::<Vec<u8>>();
assert_eq!(kmer_cache.kmer_to_index(&kmer), kmer_cache.kmer_to_index(&equated_kmer));
continue;
}

assert_eq!(kmer_cache.kmer_to_index(&kmer), i);
}
}
}
1 change: 1 addition & 0 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod binary;
pub mod peptide_search;
pub mod sa_searcher;
pub mod suffix_to_protein_index;
mod bounds_cache;

/// Represents a suffix array.
pub enum SuffixArray {
Expand Down
109 changes: 83 additions & 26 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use std::{cmp::min, ops::Deref};

use std::{cmp::min, cmp::max, ops::Deref};
use std::time::{SystemTime, SystemTimeError, UNIX_EPOCH};
use sa_mappings::proteins::{Protein, Proteins};

use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex},
Nullable, SuffixArray
};
use crate::bounds_cache::BoundsCache;

/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
#[derive(Clone, Copy, PartialEq)]
Expand Down Expand Up @@ -74,9 +75,9 @@ impl PartialEq for SearchAllSuffixesResult {
pub struct SparseSearcher(Searcher);

impl SparseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k);
Self(searcher)
}
}
Expand All @@ -92,9 +93,9 @@ impl Deref for SparseSearcher {
pub struct DenseSearcher(Searcher);

impl DenseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self {
let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k);
Self(searcher)
}
}
Expand All @@ -121,6 +122,7 @@ impl Deref for DenseSearcher {
/// the functional analysis provided by Unipept
pub struct Searcher {
pub sa: SuffixArray,
pub kmer_cache: BoundsCache,
pub proteins: Proteins,
pub suffix_index_to_protein: Box<dyn SuffixToProteinIndex>
}
Expand All @@ -142,8 +144,41 @@ impl Searcher {
/// # Returns
///
/// Returns a new Searcher object
pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box<dyn SuffixToProteinIndex>) -> Self {
Self { sa, proteins, suffix_index_to_protein }
pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box<dyn SuffixToProteinIndex>, k: usize) -> Self {
// Create a KTable with all possible 3-mers
let kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string(), k);

// Create the Searcher object
let mut searcher = Self { sa, kmer_cache, proteins, suffix_index_to_protein };

// Update the bounds for all 3-mers in the KTable
let base = searcher.kmer_cache.base;
let length = (base.pow(k as u32 + 1) - base) / (base - 1);

let print_step_size = max(length / 100, searcher.kmer_cache.base);

eprintln!("Starting cache fill");
let start_cache_fill_time = get_time_ms().unwrap();

for i in 0..length {
if i % print_step_size == 0 {
eprintln!("Updating kmer cache: {}% ({} seconds since start)", i / print_step_size, (get_time_ms().unwrap() - start_cache_fill_time) / 1000.0);
}

let kmer = searcher.kmer_cache.index_to_kmer(i);

let bounds = searcher.search_bounds_no_cache(&kmer, (0, searcher.sa.len()), 0);

if let BoundSearchResult::SearchResult((min_bound, max_bound)) = bounds {
let min_bound = if min_bound == 0 { 0 } else { min_bound - 1 };
let max_bound = if max_bound == searcher.sa.len() { max_bound } else { max_bound + 1 };
searcher.kmer_cache.update_kmer(&kmer, (min_bound, max_bound));
}
}

eprintln!("Filled cache in {} seconds", (get_time_ms().unwrap() - start_cache_fill_time) / 1000.0);

searcher
}

/// Compares the `search_string` to the `suffix`
Expand Down Expand Up @@ -224,10 +259,9 @@ impl Searcher {
/// The first argument is true if a match was found
/// The second argument indicates the index of the minimum or maximum bound for the match
/// (depending on `bound`)
fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) {
let mut left: usize = 0;
let mut right: usize = self.sa.len();
let mut lcp_left: usize = 0;
fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8], start_bounds: (usize, usize), lcp_left: usize) -> (bool, usize) {
let (mut left, mut right) = start_bounds;
let mut lcp_left: usize = lcp_left;
let mut lcp_right: usize = 0;
let mut found = false;

Expand Down Expand Up @@ -278,13 +312,31 @@ impl Searcher {
/// Returns the minimum and maximum bound of all matches in the suffix array, or `NoMatches` if
/// no matches were found
pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult {
let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string);
// If the string is empty, we don't need to search as nothing can be matched
if search_string.is_empty() {
return BoundSearchResult::NoMatches;
}

// Do a quick lookup in the kmer cache
// Use the (up to) first 5 characters of the search string as the kmer
// If the kmer is found in the cache, use the bounds from the cache as start bounds
// to find the bounds of the entire string
let max_mer_length = min(self.kmer_cache.k, search_string.len());
if let Some(bounds) = self.kmer_cache.get_kmer(&search_string[..max_mer_length]) {
return self.search_bounds_no_cache(search_string, bounds, max_mer_length);
}

BoundSearchResult::NoMatches
}

pub fn search_bounds_no_cache(&self, search_string: &[u8], start_bounds: (usize, usize), lcp_left: usize) -> BoundSearchResult {
let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string, start_bounds, lcp_left);

if !found_min {
return BoundSearchResult::NoMatches;
}

let (_, max_bound) = self.binary_search_bound(Maximum, search_string);
let (_, max_bound) = self.binary_search_bound(Maximum, search_string, start_bounds, lcp_left);

BoundSearchResult::SearchResult((min_bound, max_bound + 1))
}
Expand Down Expand Up @@ -325,7 +377,7 @@ impl Searcher {
let il_locations_current_suffix = &il_locations[il_locations_start..];
let current_search_string_prefix = &search_string[..skip];
let current_search_string_suffix = &search_string[skip..];
let search_bound_result = self.search_bounds(&search_string[skip..]);
let search_bound_result = self.search_bounds(current_search_string_suffix);
// if the shorter part is matched, see if what goes before the matched suffix matches
// the unmatched part of the prefix
if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result {
Expand Down Expand Up @@ -456,6 +508,10 @@ impl Searcher {
}
}

pub fn get_time_ms() -> Result<f64, SystemTimeError> {
Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6)
}

#[cfg(test)]
mod tests {
use sa_mappings::proteins::{Protein, Proteins};
Expand Down Expand Up @@ -521,15 +577,16 @@ mod tests {
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'A'
let bounds_res = searcher.search_bounds(&[b'A']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((4, 9)));

// search bounds '$'
let bounds_res = searcher.search_bounds(&[b'$']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((0, 1)));
// TODO: do we want to keep this??
// let bounds_res = searcher.search_bounds(&[b'$']);
// assert_eq!(bounds_res, BoundSearchResult::SearchResult((0, 1)));

// search bounds 'AC'
let bounds_res = searcher.search_bounds(&[b'A', b'C']);
Expand All @@ -542,7 +599,7 @@ mod tests {
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

// search suffix 'VAA'
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
Expand All @@ -559,7 +616,7 @@ mod tests {
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

let bounds_res = searcher.search_bounds(&[b'I']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16)));
Expand All @@ -575,7 +632,7 @@ mod tests {
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'RIZ' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
Expand All @@ -602,7 +659,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
Expand All @@ -624,7 +681,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]));
Expand All @@ -645,7 +702,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
Expand All @@ -666,7 +723,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

// search all places where II is in the string IIIILL, but with a sparse SA
// this way we check if filtering the suffixes works as expected
Expand All @@ -689,7 +746,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
Expand Down
Loading