From 15a6b9a7f0f229171cb3055f8099616fa7196d17 Mon Sep 17 00:00:00 2001 From: Micha Birklbauer Date: Wed, 24 Jan 2024 14:59:03 +0100 Subject: [PATCH 1/4] optimize FDR calculation --- msannika_fdr.py | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/msannika_fdr.py b/msannika_fdr.py index a0d3e4a..193b2c8 100644 --- a/msannika_fdr.py +++ b/msannika_fdr.py @@ -6,10 +6,11 @@ # micha.birklbauer@gmail.com # version tracking -__version = "1.0.0" -__date = "2024-01-09" +__version = "1.1.0" +__date = "2024-01-24" # REQUIREMENTS +# pip install numpy # pip install pandas # pip install openpyxl @@ -41,6 +42,7 @@ ###################### import argparse +import numpy as np import pandas as pd from typing import List @@ -138,20 +140,16 @@ class MSAnnika_CSM_Validator: def get_class(row: pd.Series) -> str: return "Decoy" if "D" in row["Alpha T/D"] or "D" in row["Beta T/D"] else "Target" - @staticmethod - def get_fdr(data: pd.DataFrame, score: float) -> float: - - df = data[data["Combined Score"] > score].copy() - df["Class"] = df.apply(lambda row: MSAnnika_CSM_Validator.get_class(row), axis = 1) - - return df[df["Class"] == "Decoy"].shape[0] / df[df["Class"] == "Target"].shape[0] - @staticmethod def get_cutoff(data: pd.DataFrame, fdr: float) -> float: - scores = sorted(data["Combined Score"].tolist()) - for score in scores: - if MSAnnika_CSM_Validator.get_fdr(data, score) < fdr: + data["Class"] = data.apply(lambda row: MSAnnika_CSM_Validator.get_class(row), axis = 1) + data["Class_label"] = data.apply(lambda row: 0 if row["Class"] == "Target" else 1, axis = 1) + labels = data["Class_label"].to_numpy() + labels_sorted = labels[data["Combined Score"].to_numpy().argsort()] + + for i, score in enumerate(sorted(data["Combined Score"].tolist())): + if labels_sorted[i:].sum() / (labels_sorted[i:].shape[0] - labels_sorted[i:].sum()) < fdr: return score return scores[0] @@ -175,20 +173,16 @@ class MSAnnika_Crosslink_Validator: def get_class(row: pd.Series) -> str: return "Decoy" if row["Decoy"] else "Target" - @staticmethod - def get_fdr(data: pd.DataFrame, score: float) -> float: - - df = data[data["Best CSM Score"] > score].copy() - df["Class"] = df.apply(lambda row: MSAnnika_Crosslink_Validator.get_class(row), axis = 1) - - return df[df["Class"] == "Decoy"].shape[0] / df[df["Class"] == "Target"].shape[0] - @staticmethod def get_cutoff(data: pd.DataFrame, fdr: float) -> float: - scores = sorted(data["Best CSM Score"].tolist()) - for score in scores: - if MSAnnika_Crosslink_Validator.get_fdr(data, score) < fdr: + data["Class"] = data.apply(lambda row: MSAnnika_CSM_Validator.get_class(row), axis = 1) + data["Class_label"] = data.apply(lambda row: 0 if row["Class"] == "Target" else 1, axis = 1) + labels = data["Class_label"].to_numpy() + labels_sorted = labels[data["Best CSM Score"].to_numpy().argsort()] + + for i, score in enumerate(sorted(data["Best CSM Score"].tolist())): + if labels_sorted[i:].sum() / (labels_sorted[i:].shape[0] - labels_sorted[i:].sum()) < fdr: return score return scores[0] From 3972e3683aea61b78c4a7e45df2edd4bafda56e9 Mon Sep 17 00:00:00 2001 From: Micha Birklbauer Date: Wed, 24 Jan 2024 15:02:10 +0100 Subject: [PATCH 2/4] fix undeclared var --- msannika_fdr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/msannika_fdr.py b/msannika_fdr.py index 193b2c8..ed4f302 100644 --- a/msannika_fdr.py +++ b/msannika_fdr.py @@ -148,7 +148,8 @@ def get_cutoff(data: pd.DataFrame, fdr: float) -> float: labels = data["Class_label"].to_numpy() labels_sorted = labels[data["Combined Score"].to_numpy().argsort()] - for i, score in enumerate(sorted(data["Combined Score"].tolist())): + scores = sorted(data["Combined Score"].tolist()) + for i, score in enumerate(scores): if labels_sorted[i:].sum() / (labels_sorted[i:].shape[0] - labels_sorted[i:].sum()) < fdr: return score @@ -181,7 +182,8 @@ def get_cutoff(data: pd.DataFrame, fdr: float) -> float: labels = data["Class_label"].to_numpy() labels_sorted = labels[data["Best CSM Score"].to_numpy().argsort()] - for i, score in enumerate(sorted(data["Best CSM Score"].tolist())): + scores = sorted(data["Best CSM Score"].tolist()) + for i, score in enumerate(scores): if labels_sorted[i:].sum() / (labels_sorted[i:].shape[0] - labels_sorted[i:].sum()) < fdr: return score From 6bff9c3f8e486c875eed6a5afa0b62ff3b81211f Mon Sep 17 00:00:00 2001 From: Micha Birklbauer Date: Wed, 24 Jan 2024 15:04:51 +0100 Subject: [PATCH 3/4] use correct validator --- msannika_fdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/msannika_fdr.py b/msannika_fdr.py index ed4f302..ce9aeff 100644 --- a/msannika_fdr.py +++ b/msannika_fdr.py @@ -177,7 +177,7 @@ def get_class(row: pd.Series) -> str: @staticmethod def get_cutoff(data: pd.DataFrame, fdr: float) -> float: - data["Class"] = data.apply(lambda row: MSAnnika_CSM_Validator.get_class(row), axis = 1) + data["Class"] = data.apply(lambda row: MSAnnika_Crosslink_Validator.get_class(row), axis = 1) data["Class_label"] = data.apply(lambda row: 0 if row["Class"] == "Target" else 1, axis = 1) labels = data["Class_label"].to_numpy() labels_sorted = labels[data["Best CSM Score"].to_numpy().argsort()] From 5f5871d1822f15e67f6f11b2914d5e69b61526ab Mon Sep 17 00:00:00 2001 From: Micha Birklbauer Date: Wed, 24 Jan 2024 15:11:54 +0100 Subject: [PATCH 4/4] fix cutoff comparison --- msannika_fdr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/msannika_fdr.py b/msannika_fdr.py index ce9aeff..b3c9f04 100644 --- a/msannika_fdr.py +++ b/msannika_fdr.py @@ -159,7 +159,7 @@ def get_cutoff(data: pd.DataFrame, fdr: float) -> float: def validate(data: pd.DataFrame, fdr: float) -> pd.DataFrame: cutoff = MSAnnika_CSM_Validator.get_cutoff(data, fdr) - df = data[data["Combined Score"] > cutoff].copy() + df = data[data["Combined Score"] >= cutoff].copy() if "Confidence" not in df.columns: return df @@ -193,7 +193,7 @@ def get_cutoff(data: pd.DataFrame, fdr: float) -> float: def validate(data: pd.DataFrame, fdr: float) -> pd.DataFrame: cutoff = MSAnnika_Crosslink_Validator.get_cutoff(data, fdr) - df = data[data["Best CSM Score"] > cutoff].copy() + df = data[data["Best CSM Score"] >= cutoff].copy() df["Confidence"] = df.apply(lambda row: "High", axis = 1) return df