Skip to content

Commit

Permalink
Added de-novo-align example
Browse files Browse the repository at this point in the history
  • Loading branch information
douweschulte committed Sep 11, 2024
1 parent 3e8c393 commit 5c179ce
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 13 deletions.
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The subfolders contain small example programs/scripts written with rustyms. See the README of each example for more details.
1 change: 1 addition & 0 deletions examples/de-novo-align/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ clap = { workspace = true, features = ["derive", "cargo"] }
itertools = { workspace = true }
rayon = { workspace = true }
serde_json = { workspace = true }
ordered-float = { workspace = true }
8 changes: 8 additions & 0 deletions examples/de-novo-align/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# De Novo align

Usage:
```
cargo run --release --bin de-novo-align -- --peptides rustyms\data\200305_HER_test_04_DENOVO.csv.gz --database examples\de-novo-align\database.fasta --out-path out.csv
```

This aligns all peptides from a given identified peptides file (see rustyms for a list of all supported file formats) to a list of known proteins. It writes a CSV file with the best alignment for each _de novo_ peptide. This can be used to investigate how good the _de novo_ predictions actually are.
56 changes: 56 additions & 0 deletions examples/de-novo-align/database.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
>sp|P35900|K1C20_HUMAN Keratin, type I cytoskeletal 20 OS=Homo sapiens OX=9606 GN=KRT20 PE=1 SV=1
MDFSRRSFHRSLSSSLQAPVVSTVGMQRLGTTPSVYGGAGGRGIRISNSRHTVNYGSDLT
GGGDLFVGNEKMAMQNLNDRLASYLEKVRTLEQSNSKLEVQIKQWYETNAPRAGRDYSAY
YRQIEELRSQIKDAQLQNARCVLQIDNAKLAAEDFRLKYETERGIRLTVEADLQGLNKVF
DDLTLHKTDLEIQIEELNKDLALLKKEHQEEVDGLHKHLGNTVNVEVDAAPGLNLGVIMN
EMRQKYEVMAQKNLQEAKEQFERQTAVLQQQVTVNTEELKGTEVQLTELRRTSQSLEIEL
QSHLSMKESLEHTLEETKARYSSQLANLQSLLSSLEAQLMQIRSNMERQNNEYHILLDIK
TRLEQEIATYRRLLEGEDVKTTEYQLSTLEERDIKKTRKIKTVVQEVVDGKVVSSEVKEV
EENI
>sp|P00761|TRYP_PIG Trypsin OS=Sus scrofa OX=9823 PE=1 SV=1
FPTDDDDKIVGGYTCAANSIPYQVSLNSGSHFCGGSLINSQWVVSAAHCYKSRIQVRLGE
HNIDVLEGNEQFINAAKIITHPNFNGNTLDNDIMLIKLSSPATLNSRVATVSLPRSCAAA
GTECLISGWGNTKSSGSSYPSLLQCLKAPVLSDSSCKSSYPGQITGNMICVGFLEGGKDS
CQGDSGGPVVCNGQLQGIVSWGYGCAQKNKPGVYTKVCNYVNWIQQTIAAN
>sp|Q99895|CTRC_HUMAN Chymotrypsin-C OS=Homo sapiens OX=9606 GN=CTRC PE=1 SV=2
MLGITVLAALLACASSCGVPSFPPNLSARVVGGEDARPHSWPWQISLQYLKNDTWRHTCG
GTLIASNFVLTAAHCISNTRTYRVAVGKNNLEVEDEEGSLFVGVDTIHVHKRWNALLLRN
DIALIKLAEHVELSDTIQVACLPEKDSLLPKDYPCYVTGWGRLWTNGPIADKLQQGLQPV
VDHATCSRIDWWGFRVKKTMVCAGGDGVISACNGDSGGPLNCQLENGSWEVFGIVSFGSR
RGCNTRKKPVVYTRVSAYIDWINEKMQL
>sp|P00778|PRLA_LYSEN Alpha-lytic protease OS=Lysobacter enzymogenes OX=69 GN=alpha-LP PE=1 SV=3
MYVSNHRSRRVARVSVSCLVAALAAMSCGAALAADQVDPQLKFAMQRDLGIFPTQLPQYL
QTEKLARTQAAAIEREFGAQFAGSWIERNEDGSFKLVAATSGARKSSTLGGVEVRNVRYS
LKQLQSAMEQLDAGANARVKGVSKPLDGVQSWYVDPRSNAVVVKVDDGATEAGVDFVALS
GADSAQVRIESSPGKLQTTANIVGGIEYSINNASLCSVGFSVTRGATKGFVTAGHCGTVN
ATARIGGAVVGTFAARVFPGNDRAWVSLTSAQTLLPRVANGSSFVTVRGSTEAAVGAAVC
RSGRTTGYQCGTITAKNVTANYAEGAVRGLTQGNACMGRGDSGGSWITSAGQAQGVMSGG
NVQSNGNNCGIPASQRSSLFERLQPILSQYGLSLVTG
>sp|P00800|THER_BACTH Thermolysin OS=Bacillus thermoproteolyticus OX=1427 GN=npr PE=1 SV=3
MKMKMKLASFGLAAGLAAQVFLPYNALASTEHVTWNQQFQTPQFISGDLLKVNGTSPEEL
VYQYVEKNENKFKFHENAKDTLQLKEKKNDNLGFTFMRFQQTYKGIPVFGAVVTSHVKDG
TLTALSGTLIPNLDTKGSLKSGKKLSEKQARDIAEKDLVANVTKEVPEYEQGKDTEFVVY
VNGDEASLAYVVNLNFLTPEPGNWLYIIDAVDGKILNKFNQLDAAKPGDVKSITGTSTVG
VGRGVLGDQKNINTTYSTYYYLQDNTRGNGIFTYDAKYRTTLPGSLWADADNQFFASYDA
PAVDAHYYAGVTYDYYKNVHNRLSYDGNNAAIRSSVHYSQGYNNAFWNGSQMVYGDGDGQ
TFIPLSGGIDVVAHELTHAVTDYTAGLIYQNESGAINEAISDIFGTLVEFYANKNPDWEI
GEDVYTPGISGDSLRSMSDPAKYGDPDHYSKRYTGTQDNGGVHINSGIINKAAYLISQGG
THYGVSVVGIGRDKLGKIFYRALTQYLTPTSNFSQLRAAAVQSATDLYGSTSQEVASVKQ
AFDAVGVK
>sp|Q9R4J4|ASPN_PSEFR Peptidyl-Asp metalloendopeptidase (Fragment) OS=Pseudomonas fragi OX=296 PE=1 SV=2
ESNQGYVNSNVGIELARYETTNYTESGSFDTDLARFRGTSDSIHTSRNTYTAADCATGYY
SFAHEIGHLQSARDIATDSSTSPYAYGHGYRYEPATGWRTIMAYNCTRSCPRLNYWSNPN
ISYDIGPDNQRVLVNTKATIAAFR
>Herceptin
EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRY
ADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSS
ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSS
GLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGG
PSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYN
STYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREE
MTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRW
QQGNVFSCSVMHEALHNHYTQKSLSLSPGK
DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPS
RFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPP
SDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLT
LSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
74 changes: 63 additions & 11 deletions examples/de-novo-align/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,16 @@
use std::{
fs::File,
io::{BufReader, BufWriter},
};
use std::{collections::HashMap, fs::File, io::BufWriter};

use clap::Parser;
use fragment::FragmentType;
use identification::{open_identified_peptides_file, FastaData};
use itertools::Itertools;
use ordered_float::OrderedFloat;
use rayon::prelude::*;
use rustyms::{
spectrum::{Score, Scores},
system::{e, usize::Charge},
align::{align, matrix, AlignType},
csv::write_csv,
identification::{open_identified_peptides_file, FastaData},
system::da,
*,
};
use spectrum::PeakSpectrum;
use std::collections::HashMap;

#[derive(Parser)]
struct Cli {
Expand All @@ -31,6 +27,62 @@ struct Cli {

/// Aligns every identified (_de novo_) peptide against every protein in the
/// FASTA database and writes the best-scoring alignment per peptide to a CSV
/// file.
///
/// # Panics
/// Panics when the output file cannot be created, the peptides file cannot be
/// opened, the database cannot be parsed, the database contains no proteins,
/// an identified entry has no peptide, or the CSV cannot be written.
fn main() {
    let args = Cli::parse();
    let out_file =
        BufWriter::new(File::create(args.out_path).expect("could not create the output file"));
    // Open the peptides file exactly once; entries that fail to parse are
    // skipped instead of aborting the whole run.
    let peptides = open_identified_peptides_file(args.peptides, None)
        .expect("could not open the identified peptides file")
        .filter_map(|p| p.ok())
        .collect_vec();
    let database =
        FastaData::parse_file(args.database).expect("could not parse the FASTA database");

    let alignments: Vec<_> = peptides
        .par_iter()
        .map(|peptide| {
            // Align this peptide against every database entry and keep only the
            // alignment with the highest normalised score.
            database
                .iter()
                .map(|db| {
                    (
                        db,
                        peptide,
                        align::<4, SemiAmbiguous, SemiAmbiguous>(
                            &db.peptide,
                            // NOTE(review): panics when the entry carries no
                            // peptide; confirm whether such entries should be
                            // skipped instead.
                            peptide.metadata.peptide().unwrap(),
                            matrix::BLOSUM62,
                            Tolerance::Absolute(da(0.1)),
                            AlignType::EITHER_GLOBAL,
                        ),
                    )
                })
                // Floats are not `Ord`, so wrap the score to get a total order.
                .max_by_key(|a| OrderedFloat(a.2.normalised_score()))
                // `max_by_key` only yields `None` for an empty iterator.
                .expect("the database does not contain any proteins")
        })
        .map(|(db, peptide, alignment)| {
            // One CSV row per peptide, keyed by column name. Sequence `a` is the
            // database protein and sequence `b` the identified peptide (the
            // argument order passed to `align` above).
            HashMap::from([
                ("Peptide".to_string(), alignment.seq_b().to_string()),
                (
                    "Rawfile".to_string(),
                    peptide
                        .metadata
                        .raw_file()
                        .map_or(String::new(), |p| p.to_string_lossy().to_string()),
                ),
                (
                    "De novo score".to_string(),
                    peptide.score.map_or(String::new(), |s| s.to_string()),
                ),
                ("Protein".to_string(), db.id.clone()),
                (
                    "Score".to_string(),
                    alignment.normalised_score().to_string(),
                ),
                ("Start".to_string(), alignment.start_a().to_string()),
                (
                    "End".to_string(),
                    (alignment.start_a() + alignment.len_a()).to_string(),
                ),
                ("Path".to_string(), alignment.short()),
            ])
        })
        .collect();

    write_csv(out_file, alignments).expect("could not write the output CSV");
}
9 changes: 9 additions & 0 deletions examples/multi-annotator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Multi annotator

Usage:
```
cargo run --release --bin multi-annotator -- --in-path .\CIDcurves_file_to_match.csv --out-path out.csv
```
Note: the example files are not present.

This takes a CSV file as input that contains a peptide and the rawfile it originated from. It then annotates the spectrum with the theoretical fragmentation from rustyms and reports some statistics on the annotation in a resulting CSV file. This can be used to get a global impression of a whole dataset, for example to see whether a certain fragmentation energy increases or decreases the coverage of a particular ion series (peptide or glycan).
2 changes: 0 additions & 2 deletions examples/multi-annotator/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ struct Cli {
}

fn main() {
//cargo run --release -- --in-path .\CIDcurves_file_to_match.csv --out-path out.csv
//let model = Model::none().b(Location::All, Vec::new()).y(Location::All, Vec::new()).glycan(Some(Vec::new()));
let model = Model::all();
let args = Cli::parse();
let path = ProjectDirs::from("com", "com.snijderlab.annotator", "")
Expand Down

0 comments on commit 5c179ce

Please sign in to comment.