
Commit

Merge pull request #389 from daichengxin/dev
diann 1.8.1 -> 1.9.1dev
ypriverol authored Jul 23, 2024
2 parents 94424c4 + 2e2f0aa commit 85f04b8
Showing 12 changed files with 150 additions and 57 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/ci.yml
@@ -43,6 +43,9 @@ jobs:
exec_profile: conda
- NXF_VER: "latest-everything"
exec_profile: "conda"
+ include:
+   - test_profile: test_latest_dia
+     exec_profile: singularity
steps:
- name: Check out pipeline code
uses: actions/checkout@v4
@@ -65,9 +68,11 @@ jobs:
echo "$(pwd)/micromamba/bin" >> $GITHUB_PATH
./bin/micromamba shell init -s bash
echo $'channels:\n - conda-forge\n - bioconda\n - defaults\nuse_lockfiles: false' >> ~/.mambarc
- - name: Run pipeline with test data
-   if: matrix.exec_profile != 'conda'
+ - name: Install Singularity with defaults
+   if: matrix.exec_profile == 'singularity'
+   uses: singularityhub/install-singularity@main
+ - name: Run pipeline with test data in docker profile
+   if: matrix.exec_profile == 'docker'
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
@@ -82,6 +87,14 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,micromamba --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
+ - name: Run pipeline with test data in singularity profile
+   if: matrix.exec_profile == 'singularity'
+   # TODO nf-core: You can customise CI pipeline run tests as required
+   # For example: adding multiple test runs with different parameters
+   # Remember that you can parallelise this by using strategy.matrix
+   run: |
+     nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,$EXEC_PROFILE --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
- name: Gather failed logs
if: failure() || cancelled()
run: |
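For reference, the added Singularity matrix entry boils down to a single Nextflow invocation. A minimal local sketch (assuming Nextflow and Singularity are installed and the command is run from a checkout of the pipeline, which stands in for ${GITHUB_WORKSPACE}):

    # Hypothetical local equivalent of the new test_latest_dia + singularity CI combination
    TEST_PROFILE=test_latest_dia
    EXEC_PROFILE=singularity
    nextflow run . -profile ${TEST_PROFILE},${EXEC_PROFILE} --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
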
54 changes: 27 additions & 27 deletions bin/diann_convert.py
Expand Up @@ -292,7 +292,7 @@ def diann_version(self) -> str:
return diann_version_id

def validate_diann_version(self) -> None:
- supported_diann_versions = ["1.8.1"]
+ supported_diann_versions = ["1.8.1", "1.9.beta.1"]
if self.diann_version not in supported_diann_versions:
raise ValueError(f"Unsupported DIANN version {self.diann_version}")

@@ -591,13 +591,13 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):

logger.debug("Classifying results type ...")
pg["opt_global_result_type"] = "single_protein"
- pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group"
+ pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group"

out_mztab_PRH = pg
del pg
out_mztab_PRH = out_mztab_PRH.drop(["Protein.Names"], axis=1)
out_mztab_PRH.rename(
- columns={"Protein.Group": "accession", "First.Protein.Description": "description"}, inplace=True
+ columns={"First.Protein.Description": "description"}, inplace=True
)
out_mztab_PRH.loc[:, "database"] = database

@@ -614,10 +614,10 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
out_mztab_PRH.loc[:, i] = "null"

logger.debug("Extracting accession values (keeping first)...")
- out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["accession"].split(";")[0], axis=1)
+ out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["Protein.Group"].split(";")[0], axis=1)

protein_details_df = out_mztab_PRH[out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group"]
- prh_series = protein_details_df["Protein.Ids"].str.split(";", expand=True).stack().reset_index(level=1, drop=True)
+ prh_series = protein_details_df["Protein.Group"].str.split(";", expand=True).stack().reset_index(level=1, drop=True)
prh_series.name = "accession"
protein_details_df = (
protein_details_df.drop("accession", axis=1).join(prh_series).reset_index().drop(columns="index")
@@ -644,14 +644,14 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
# out_mztab_PRH.loc[out_mztab_PRH["opt_global_result_type"] == "single_protein", "ambiguity_members"] = "null"
# or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply(
- lambda x: x["Protein.Ids"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null",
+ lambda x: x["Protein.Group"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null",
axis=1,
)

logger.debug("Matching PRH to best search engine score...")
score_looker = ModScoreLooker(report)
out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply(
- lambda x: score_looker.get_score(x["Protein.Ids"]), axis=1, result_type="expand"
+ lambda x: score_looker.get_score(x["Protein.Group"]), axis=1, result_type="expand"
)

logger.debug("Matching PRH to modifications...")
@@ -664,16 +664,16 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
# This used to be a bottleneck in performance
# This implementation drops the run time from 57s to 25ms
protein_agg_report = (
- report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]]
- .groupby(["study_variable", "Protein.Ids"])
+ report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
+ .groupby(["study_variable", "Protein.Group"])
.agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
.reset_index()
- .pivot(columns=["study_variable"], index="Protein.Ids")
+ .pivot(columns=["study_variable"], index="Protein.Group")
.reset_index()
)
protein_agg_report.columns = ["::".join([str(s) for s in col]).strip() for col in protein_agg_report.columns.values]
subname_mapper = {
- "Protein.Ids::::": "Protein.Ids",
+ "Protein.Group::::": "Protein.Group",
"PG.MaxLFQ::mean": "protein_abundance_study_variable",
"PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
"PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
@@ -685,7 +685,7 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
# Oddly enough the last implementation mapped the accession (Q9NZJ9) in the mztab
# to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
out_mztab_PRH = out_mztab_PRH.merge(
- protein_agg_report, on="Protein.Ids", how="left", validate="many_to_one", copy=True
+ protein_agg_report, on="Protein.Group", how="left", validate="many_to_one", copy=True
)
del name_mapper
del subname_mapper
@@ -694,7 +694,7 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):

out_mztab_PRH.loc[:, "PRH"] = "PRT"
index = out_mztab_PRH.loc[:, "PRH"]
- out_mztab_PRH.drop(["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True)
+ out_mztab_PRH.drop(["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True)
out_mztab_PRH.insert(0, "PRH", index)
out_mztab_PRH.fillna("null", inplace=True)
out_mztab_PRH.loc[:, "database"] = database
@@ -734,12 +734,12 @@ def mztab_PEH(
out_mztab_PEH = pd.DataFrame()
out_mztab_PEH = pr.iloc[:, 0:10]
out_mztab_PEH.drop(
- ["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis=1, inplace=True
+ ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis=1, inplace=True
)
out_mztab_PEH.rename(
columns={
"Stripped.Sequence": "sequence",
- "Protein.Ids": "accession",
+ "Protein.Group": "accession",
"Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
"Precursor.Charge": "charge",
},
@@ -909,7 +909,7 @@ def __find_info(directory, n):
out_mztab_PSH = out_mztab_PSH[
[
"Stripped.Sequence",
- "Protein.Ids",
+ "Protein.Group",
"Q.Value",
"RT.Start",
"Precursor.Charge",
@@ -1014,7 +1014,7 @@ def classify_result_type(target):
:return: A string implying the protein type
:rtype: str
"""
- if ";" in target["Protein.Ids"]:
+ if ";" in target["Protein.Group"]:
return "indistinguishable_protein_group"
return "single_protein"

@@ -1056,7 +1056,7 @@ def match_in_report(report, target, max_, flag, level):
return tuple(q_value)

if flag == 1 and level == "protein":
- result = report[report["Protein.Ids"] == target]
+ result = report[report["Protein.Group"] == target]
PRH_params = []
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
@@ -1083,19 +1083,19 @@ def __init__(self, report: pd.DataFrame) -> None:

def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
grouped_df = (
- report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]]
+ report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
.sort_values("Global.PG.Q.Value", ascending=True)
- .groupby(["Protein.Ids"])
+ .groupby(["Protein.Group"])
.head(1)
)
- # Modified.Sequence Protein.Ids Global.PG.Q.Value
+ # Modified.Sequence Protein.Group Global.PG.Q.Value
# 78265 LFNEQNFFQR Q8IV63;Q8IV63-2;Q8IV63-3 0.000252
# 103585 NPTIVNFPITNVDLR Q53GS9;Q53GS9-2 0.000252
# 103586 NPTWKPLIR Q7Z4Q2;Q7Z4Q2-2 0.000252
# 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252

out = {
- row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows()
+ row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows()
}
return out

@@ -1325,17 +1325,17 @@ def calculate_protein_coverages(report: pd.DataFrame, out_mztab_PRH: pd.DataFram
protein in the PRH table (defined by accession, not protein.ids).
"""
nested_df = (
- report[["Protein.Ids", "Stripped.Sequence"]]
- .groupby("Protein.Ids")
+ report[["Protein.Group", "Stripped.Sequence"]]
+ .groupby("Protein.Group")
.agg({"Stripped.Sequence": set})
.reset_index()
)
- # Protein.Ids Stripped.Sequence
+ # Protein.Group Stripped.Sequence
# 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
# 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
# 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
- ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"]))
- acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Ids"]))
+ ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
+ acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Group"]))
fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
acc_to_fasta_ids: dict = {}

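The net effect of these edits is that every lookup, groupby, and merge in diann_convert.py is now keyed on the DIA-NN report's Protein.Group column instead of Protein.Ids. A quick way to see which of the two columns a given report actually carries is to inspect its header; a sketch, where diann_report.tsv is only a placeholder for the report file produced by your run:

    # List the protein-group columns present in a DIA-NN main report (hypothetical file name)
    head -n 1 diann_report.tsv | tr '\t' '\n' | grep -E '^Protein\.(Ids|Group)$'
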
6 changes: 3 additions & 3 deletions bin/psm_conversion.py
@@ -10,7 +10,7 @@

_parquet_field = [
"sequence", "protein_accessions", "protein_start_positions", "protein_end_positions",
- "modifications", "retention_time", "charge", "calc_mass_to_charge", "reference_file_name",
+ "modifications", "retention_time", "charge", "exp_mass_to_charge", "reference_file_name",
"scan_number", "peptidoform", "posterior_error_probability", "global_qvalue", "is_decoy",
"consensus_support", "mz_array", "intensity_array", "num_peaks", "search_engines", "id_scores", "hit_rank"
]
@@ -61,7 +61,7 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):

for peptide_id in pep_ids:
retention_time = peptide_id.getRT()
- calc_mass_to_charge = peptide_id.getMZ()
+ exp_mass_to_charge = peptide_id.getMZ()
scan_number = int(re.findall(r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference"))[0][1])

if isinstance(spectra_df, pd.DataFrame):
@@ -101,7 +101,7 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):
hit_rank = hit.getRank()

parquet_data.append([sequence, protein_accessions, protein_start_positions, protein_end_positions,
- modifications, retention_time, charge, calc_mass_to_charge, reference_file_name,
+ modifications, retention_time, charge, exp_mass_to_charge, reference_file_name,
scan_number, peptidoform, posterior_error_probability, global_qvalue, is_decoy,
consensus_support, mz_array, intensity_array, num_peaks, search_engines, id_scores,
hit_rank])
48 changes: 48 additions & 0 deletions conf/test_latest_dia.config
@@ -0,0 +1,48 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests (DIA)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple test.
Use as follows:
nextflow run nf-core/quantms -profile test_latest_dia,<docker/singularity> [--outdir <OUTDIR>]
------------------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile for latest DIA'
config_profile_description = 'Minimal test dataset to check pipeline function for the data-independent acquisition pipeline branch for latest DIA-NN.'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = 6.GB
max_time = 48.h

outdir = './results_latest_dia'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/dia_ci/PXD026600.sdrf.tsv'
database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/dia_ci/REF_EColi_K12_UPS1_combined.fasta'
diann_version = '1.9.beta.1'
min_pr_mz = 350
max_pr_mz = 950
min_fr_mz = 500
max_fr_mz = 1500
min_peptide_length = 15
max_peptide_length = 30
max_precursor_charge = 3
allowed_missed_cleavages = 1
diann_normalize = false
skip_post_msstats = false
publish_dir_mode = 'symlink'
max_mods = 2
}

process {
// thermorawfileparser
withName: 'NFCORE_QUANTMS:QUANTMS:FILE_PREPARATION:THERMORAWFILEPARSER' {
publishDir = [path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, pattern: "*.log" ]
}
}

8 changes: 6 additions & 2 deletions modules/local/assemble_empirical_library/main.nf
@@ -2,9 +2,13 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
tag "$meta.experiment_id"
label 'process_low'

- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ if (params.diann_version == "1.9.beta.1") {
+     container 'https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif'
+ } else {
+     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
          'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' :
          'docker.io/biocontainers/diann:v1.8.1_cv1' }"
+ }

input:
// In this step the real files are passed, and not the names
@@ -58,7 +62,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)")
+ DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "\\d+\\.\\d+(\\.\\w+)*(\\.[\\d]+)?")
END_VERSIONS
"""
}
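The version-capture regex is widened because the old pattern only matched purely numeric versions and would truncate a pre-release string such as 1.9.beta.1. A quick check of the new expression against example banner lines (a sketch, assuming GNU grep with -P support; the strings are illustrative, not actual DIA-NN output):

    echo "DIA-NN 1.8.1"      | grep -oP '\d+\.\d+(\.\w+)*(\.[\d]+)?'   # prints 1.8.1
    echo "DIA-NN 1.9.beta.1" | grep -oP '\d+\.\d+(\.\w+)*(\.[\d]+)?'   # prints 1.9.beta.1
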
9 changes: 6 additions & 3 deletions modules/local/diann_preliminary_analysis/main.nf
@@ -2,9 +2,13 @@ process DIANN_PRELIMINARY_ANALYSIS {
tag "$ms_file.baseName"
label 'process_high'

- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ if (params.diann_version == "1.9.beta.1") {
+     container 'https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif'
+ } else {
+     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
          'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' :
          'docker.io/biocontainers/diann:v1.8.1_cv1' }"
+ }

input:
tuple val(meta), path(ms_file), path(predict_library)
@@ -13,7 +17,6 @@ process DIANN_PRELIMINARY_ANALYSIS {
path "*.quant", emit: diann_quant
tuple val(meta), path("*_diann.log"), emit: log
path "versions.yml", emit: version
- path(ms_file), emit: preliminary_ms_file

when:
task.ext.when == null || task.ext.when
@@ -61,7 +64,7 @@ process DIANN_PRELIMINARY_ANALYSIS {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)")
+ DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "\\d+\\.\\d+(\\.\\w+)*(\\.[\\d]+)?")
END_VERSIONS
"""
}
12 changes: 9 additions & 3 deletions modules/local/diannsummary/main.nf
@@ -2,9 +2,13 @@ process DIANNSUMMARY {
tag "$meta.experiment_id"
label 'process_high'

- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ if (params.diann_version == "1.9.beta.1") {
+     container 'https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif'
+ } else {
+     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
          'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' :
          'docker.io/biocontainers/diann:v1.8.1_cv1' }"
+ }

input:
// Note that the files are passed as names and not paths, this prevents them from being staged
@@ -24,7 +28,9 @@ process DIANNSUMMARY {
path "diann_report.gg_matrix.tsv", emit: gg_matrix
path "diann_report.unique_genes_matrix.tsv", emit: unique_gene_matrix
path "diannsummary.log", emit: log
- path "empirical_library.tsv.speclib", emit: final_speclib
+ path "empirical_library.tsv", emit: final_speclib optional true
+ path "empirical_library.tsv.speclib", emit: final_tsv_speclib optional true
+ path "empirical_library.tsv.skyline.speclib", emit: skyline_speclib optional true
path "versions.yml", emit: version

when:
Expand Down Expand Up @@ -69,7 +75,7 @@ process DIANNSUMMARY {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)")
+ DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "\\d+\\.\\d+(\\.\\w+)*(\\.[\\d]+)?")
END_VERSIONS
"""
}
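To try the 1.9.1dev image that these modules now select when params.diann_version is set to 1.9.beta.1, something like the following should work (a sketch, assuming Singularity is installed locally and that the image exposes diann on its PATH, as the pipeline's commands imply):

    # Download the pre-built SIF referenced by the modules and print its version banner
    singularity pull diann-1.9.1dev.sif https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif
    singularity exec diann-1.9.1dev.sif diann 2>&1 | grep "DIA-NN"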