Skip to content

Commit

Permalink
Added general open identified peptides file function
Browse files Browse the repository at this point in the history
  • Loading branch information
douweschulte committed Sep 10, 2024
1 parent ee05d0e commit 3e8c393
Show file tree
Hide file tree
Showing 13 changed files with 153 additions and 11 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
"Psicose",
"Psimod",
"psms",
"psmtsv",
"pyclass",
"pymethods",
"pymodule",
Expand Down
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ members = [
"examples/*",
]
resolver = "2"
package.edition = "2021"
package.version = "0.9.0-alpha.1"

[profile.release]
debug = true
Expand Down
12 changes: 12 additions & 0 deletions examples/de-novo-align/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[package]
name = "de-novo-align"
version = "0.1.0"
publish = false
edition.workspace = true

[dependencies]
rustyms = { path = "../../rustyms" }
clap = { workspace = true, features = ["derive", "cargo"] }
itertools = { workspace = true }
rayon = { workspace = true }
serde_json = { workspace = true }
36 changes: 36 additions & 0 deletions examples/de-novo-align/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use std::{
fs::File,
io::{BufReader, BufWriter},
};

use clap::Parser;
use fragment::FragmentType;
use identification::{open_identified_peptides_file, FastaData};
use itertools::Itertools;
use rayon::prelude::*;
use rustyms::{
spectrum::{Score, Scores},
system::{e, usize::Charge},
*,
};
use spectrum::PeakSpectrum;
use std::collections::HashMap;

// Command line arguments of the de-novo-align example, parsed in `main` via `Cli::parse()`.
// NOTE(review): the `///` comments below are presumably picked up by clap's derive as the
// `--help` text, so they are user facing; keep extra explanation in `//` comments.
#[derive(Parser)]
struct Cli {
/// The input identified peptides file
// Handed to `open_identified_peptides_file` in `main`.
#[arg(short, long)]
peptides: String,
/// The fasta database of known proteins
// Handed to `FastaData::parse_file` in `main`.
#[arg(short, long)]
database: String,
/// Where to store the results
// Only a long flag is requested here (no `short`); `main` does not read this field yet.
#[arg(long)]
out_path: String,
}

fn main() {
let args = Cli::parse();
let peptides = open_identified_peptides_file(args.peptides, None).unwrap();
let database = FastaData::parse_file(args.database).unwrap();
}
2 changes: 1 addition & 1 deletion examples/multi-annotator/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "multi-annotator"
version = "0.1.0"
edition = "2021"
edition.workspace = true
publish = false

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "rustyms-fuzz"
version = "0.0.0"
publish = false
edition = "2021"
edition.workspace = true

[package.metadata]
cargo-fuzz = true
Expand Down
2 changes: 1 addition & 1 deletion rustyms-imgt-generate/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustyms-imgt-generate"
version = "0.1.0"
edition = "2021"
edition.workspace = true
license = "MIT OR Apache-2.0"
authors = ["Douwe Schulte <[email protected]>"]
rust-version = "1.70.0"
Expand Down
4 changes: 2 additions & 2 deletions rustyms-py/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustyms-py"
version = "0.9.0-alpha.1"
edition = "2021"
version.workspace = true
edition.workspace = true

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
Expand Down
4 changes: 2 additions & 2 deletions rustyms/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustyms"
version = "0.9.0-alpha.1"
edition = "2021"
version.workspace = true
edition.workspace = true
license = "MIT OR Apache-2.0"
authors = ["Douwe Schulte <[email protected]>"]
description = "A library to handle proteomic mass spectrometry data and match peptides to spectra."
Expand Down
12 changes: 8 additions & 4 deletions rustyms/src/identification/fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ use crate::{
CompoundPeptidoform, LinearPeptide, SequenceElement,
};
use serde::{Deserialize, Serialize};
use std::io::{BufRead, BufReader};
use std::{
io::{BufRead, BufReader},
path::Path,
};

use super::{IdentifiedPeptide, MetaData};

Expand All @@ -21,12 +24,13 @@ impl FastaData {
/// Parse a single fasta file
/// # Errors
/// A custom error when it is not a valid fasta file
pub fn parse_file(path: &str) -> Result<Vec<Self>, CustomError> {
pub fn parse_file(path: impl AsRef<Path>) -> Result<Vec<Self>, CustomError> {
let path = path.as_ref();
let file = std::fs::File::open(path).map_err(|_| {
CustomError::error(
"Failed reading fasta file",
"Error occurred while opening the file",
Context::show(path),
Context::show(path.to_string_lossy()),
)
})?;
let reader = BufReader::new(file);
Expand All @@ -39,7 +43,7 @@ impl FastaData {
CustomError::error(
"Failed reading fasta file",
format!("Error occurred while reading line {}", line_index + 1),
Context::show(path),
Context::show(path.to_string_lossy()),
)
})?;
#[allow(clippy::manual_strip)]
Expand Down
69 changes: 69 additions & 0 deletions rustyms/src/identification/general.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
use std::path::Path;

use super::{
error::{Context, CustomError},
ontologies::CustomDatabase,
FastaData, IdentifiedPeptide, IdentifiedPeptideIter, IdentifiedPeptideSource, MSFraggerData,
MaxQuantData, NovorData, OpairData, PeaksData, SageData,
};

/// Open a file with identified peptides, selecting the parser from the file extension.
///
/// The extension is lowercased before it is matched. If the outermost extension is exactly
/// `gz` the extension in front of it is used instead (so `results.csv.gz` is treated as `csv`).
/// The recognised extensions are:
/// * `csv`: tried as a Peaks file first and as a Novor file if that fails
/// * `tsv`: tried as a MSFragger file first and as a Sage file if that fails
/// * `psmtsv`: OPair
/// * `fasta`: Fasta
/// * `txt`: MaxQuant
///
/// # Errors
/// * If the extension is missing or not one of the extensions listed above.
/// * If the selected parser fails. For `csv` and `tsv` the errors of both attempted parsers
///   are discarded and replaced by a general "Unknown file" error.
pub fn open_identified_peptides_file<'a>(
    path: impl AsRef<Path>,
    custom_database: Option<&'a CustomDatabase>,
) -> Result<Box<dyn Iterator<Item = Result<IdentifiedPeptide, CustomError>> + 'a>, CustomError> {
    let path = path.as_ref();

    // Error used when the extension matched but none of the candidate parsers accepted the file
    let unrecognised = |explanation: &'static str| {
        CustomError::error(
            "Unknown file",
            explanation,
            Context::show(path.to_string_lossy()),
        )
    };

    // Look through a trailing `.gz` to find the extension that identifies the format
    let extension = path.extension().map(|outer| {
        let inner = if outer == "gz" {
            path.file_stem()
                .and_then(|stem| Path::new(stem).extension())
        } else {
            None
        };
        inner.unwrap_or(outer).to_string_lossy().to_lowercase()
    });

    match extension.as_deref() {
        Some("csv") => match PeaksData::parse_file(path, custom_database) {
            Ok(peaks) => Ok(IdentifiedPeptideIter::into_box(peaks)),
            Err(_) => NovorData::parse_file(path, custom_database)
                .map(IdentifiedPeptideIter::into_box)
                .map_err(|_| {
                    unrecognised("Could not be recognised as either a Peaks or Novor file")
                }),
        },
        Some("tsv") => match MSFraggerData::parse_file(path, custom_database) {
            Ok(msfragger) => Ok(IdentifiedPeptideIter::into_box(msfragger)),
            Err(_) => SageData::parse_file(path, custom_database)
                .map(IdentifiedPeptideIter::into_box)
                .map_err(|_| {
                    unrecognised("Could not be recognised as either a MSFragger or Sage file")
                }),
        },
        Some("psmtsv") => {
            let opair = OpairData::parse_file(path, custom_database)?;
            Ok(IdentifiedPeptideIter::into_box(opair))
        }
        Some("fasta") => {
            // Fasta files are parsed fully up front, so every entry is wrapped in `Ok`
            let proteins = FastaData::parse_file(path)?;
            Ok(
                Box::new(proteins.into_iter().map(|protein| Ok(protein.into())))
                    as Box<dyn Iterator<Item = Result<IdentifiedPeptide, CustomError>> + 'a>,
            )
        }
        Some("txt") => {
            let maxquant = MaxQuantData::parse_file(path, custom_database)?;
            Ok(IdentifiedPeptideIter::into_box(maxquant))
        }
        _ => Err(CustomError::error(
            "Unknown extension",
            "Use CSV, TSV, TXT, PSMTSV, or Fasta, or any of these as a gzipped file (eg csv.gz).",
            Context::show(path.to_string_lossy()),
        )),
    }
}
16 changes: 16 additions & 0 deletions rustyms/src/identification/identified_peptide.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,19 @@ where
}
}
}

impl<'lifetime, R, I> IdentifiedPeptideIter<'lifetime, R, I>
where
    R: IdentifiedPeptideSource + Into<IdentifiedPeptide> + 'lifetime,
    I: Iterator<Item = Result<R::Source, CustomError>> + 'lifetime,
    R::Format: 'static,
{
    /// Erase the format specific type of this iterator.
    ///
    /// Every successfully parsed item is converted into the general [`IdentifiedPeptide`],
    /// errors are passed through unchanged. The result is boxed so that iterators over
    /// different formats can be returned from the same function.
    pub(super) fn into_box(
        self,
    ) -> Box<dyn Iterator<Item = Result<IdentifiedPeptide, CustomError>> + 'lifetime> {
        // `Result::map` only touches the `Ok` value, so errors are forwarded as is
        Box::new(
            self.map(|peptide: Result<R, CustomError>| {
                peptide.map(Into::<IdentifiedPeptide>::into)
            }),
        )
    }
}
2 changes: 2 additions & 0 deletions rustyms/src/identification/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
mod common_parser;

mod fasta;
mod general;
mod helper_functions;
mod identified_peptide;
mod maxquant;
Expand All @@ -15,6 +16,7 @@ mod sage;

use crate::*;
pub use fasta::*;
pub use general::*;
pub use identified_peptide::*;
pub use maxquant::*;
pub use msfragger::*;
Expand Down

0 comments on commit 3e8c393

Please sign in to comment.