# micha.birklbauer@gmail.com

# NOTE(review): this text was reconstructed from the hunks of a patch that had
# been collapsed onto two lines. Only definitions that are fully visible in
# those hunks are reproduced; anything the patch does not show is not included.

# version tracking
__version = "1.1.0"
__date = "2024-01-24"

# REQUIREMENTS
# pip install numpy
# pip install pandas
# pip install openpyxl

######################

import argparse
import numpy as np
import pandas as pd

from typing import List


def _find_score_cutoff(scores: np.ndarray, decoy_labels: np.ndarray, fdr: float) -> float:
    """
    Return the lowest score threshold whose decoy/target ratio is below ``fdr``.

    For a threshold ``s`` the ratio is computed over all rows with
    ``score >= s``, which is exactly the set of rows the ``validate`` methods
    keep for that cutoff.

    Parameters
    ----------
    scores
        One score per row.
    decoy_labels
        One label per row, 1 for a decoy and 0 for a target.
    fdr
        The ratio (decoys / targets) that must not be reached.

    Returns
    -------
    The smallest score that satisfies the ratio. If no score does, the lowest
    score is returned (the behaviour of the original implementation).

    Raises
    ------
    ValueError
        If there are no rows.
    """

    values = np.asarray(scores)
    labels = np.asarray(decoy_labels)
    nr_rows = values.shape[0]

    if nr_rows == 0:
        raise ValueError("Cannot determine a score cutoff for an empty table.")

    # A stable sort keeps the result independent of how ties are permuted.
    order = np.argsort(values, kind = "stable")
    sorted_scores = values[order]
    sorted_labels = labels[order]

    # decoys[i] / targets[i] count the rows at sorted positions i..end. The
    # reversed cumulative sum replaces re-summing the tail for every position.
    decoys = np.cumsum(sorted_labels[::-1])[::-1]
    targets = np.arange(nr_rows, 0, -1) - decoys

    # Without any target the ratio is undefined; treat it as infinite so that
    # such a threshold never passes (and no division warning is emitted).
    ratios = np.full(nr_rows, np.inf)
    np.divide(decoys, targets, out = ratios, where = targets > 0)

    # Only the first position of a run of equal scores describes the rows with
    # ``score >= s``. Accepting a position inside such a run would ignore rows
    # that share the cutoff score and are therefore kept by the caller.
    starts_new_score = np.ones(nr_rows, dtype = bool)
    starts_new_score[1:] = sorted_scores[1:] != sorted_scores[:-1]

    passing = np.flatnonzero(starts_new_score & (ratios < fdr))
    cutoff = sorted_scores[passing[0]] if passing.size > 0 else sorted_scores[0]

    # Hand back a plain Python number, as the list based original did.
    return cutoff.item() if isinstance(cutoff, np.generic) else cutoff


class MSAnnika_CSM_Validator:
    """
    Score cutoff estimation for CSMs.

    NOTE(review): only the members that are fully visible in the reviewed patch
    are reproduced. ``validate`` is only partially visible there (it keeps the
    rows with ``Combined Score >= cutoff``) and is intentionally left out.
    """

    @staticmethod
    def get_class(row: pd.Series) -> str:
        """
        Return "Decoy" if "D" occurs in the "Alpha T/D" or "Beta T/D" value of
        the row, otherwise "Target".
        """
        return "Decoy" if "D" in row["Alpha T/D"] or "D" in row["Beta T/D"] else "Target"

    @staticmethod
    def get_cutoff(data: pd.DataFrame, fdr: float) -> float:
        """
        Return the lowest "Combined Score" for which the ratio of decoys to
        targets among all rows scoring at least that high is below ``fdr``.

        Side effect: the columns "Class" ("Target"/"Decoy") and "Class_label"
        (0/1) are added to ``data`` in place, as in the original code.

        Raises a ValueError if ``data`` has no rows.
        """

        if data.shape[0] == 0:
            raise ValueError("Cannot determine a score cutoff for an empty table.")

        data["Class"] = data.apply(MSAnnika_CSM_Validator.get_class, axis = 1)
        data["Class_label"] = (data["Class"] != "Target").astype("int64")

        return _find_score_cutoff(data["Combined Score"].to_numpy(),
                                  data["Class_label"].to_numpy(),
                                  fdr)


class MSAnnika_Crosslink_Validator:
    """
    Score cutoff estimation and validation for crosslinks.

    NOTE(review): only the members visible in the reviewed patch are reproduced.
    """

    @staticmethod
    def get_class(row: pd.Series) -> str:
        """
        Return "Decoy" if the "Decoy" value of the row is truthy, otherwise
        "Target".
        """
        return "Decoy" if row["Decoy"] else "Target"

    @staticmethod
    def get_cutoff(data: pd.DataFrame, fdr: float) -> float:
        """
        Return the lowest "Best CSM Score" for which the ratio of decoys to
        targets among all rows scoring at least that high is below ``fdr``.

        Side effect: the columns "Class" ("Target"/"Decoy") and "Class_label"
        (0/1) are added to ``data`` in place, as in the original code.

        Raises a ValueError if ``data`` has no rows.
        """

        if data.shape[0] == 0:
            raise ValueError("Cannot determine a score cutoff for an empty table.")

        data["Class"] = data.apply(MSAnnika_Crosslink_Validator.get_class, axis = 1)
        data["Class_label"] = (data["Class"] != "Target").astype("int64")

        return _find_score_cutoff(data["Best CSM Score"].to_numpy(),
                                  data["Class_label"].to_numpy(),
                                  fdr)

    @staticmethod
    def validate(data: pd.DataFrame, fdr: float) -> pd.DataFrame:
        """
        Return a copy of the rows of ``data`` whose "Best CSM Score" is at
        least the cutoff estimated for ``fdr``, with "Confidence" set to "High".

        ``data`` itself gains the helper columns written by ``get_cutoff``.
        """

        cutoff = MSAnnika_Crosslink_Validator.get_cutoff(data, fdr)
        df = data[data["Best CSM Score"] >= cutoff].copy()
        # Scalar assignment fills every row and also works for zero rows.
        df["Confidence"] = "High"

        return df