
Commit

Merge pull request #389 from daichengxin/dev
diann 1.8.1 -> 1.9.1dev
ypriverol authored Jul 23, 2024
2 parents 94424c4 + 2e2f0aa commit 85f04b8
Showing 12 changed files with 150 additions and 57 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/ci.yml
@@ -43,6 +43,9 @@ jobs:
exec_profile: conda
- NXF_VER: "latest-everything"
exec_profile: "conda"
+ include:
+   - test_profile: test_latest_dia
+     exec_profile: singularity
steps:
- name: Check out pipeline code
uses: actions/checkout@v4
@@ -65,9 +68,11 @@ jobs:
echo "$(pwd)/micromamba/bin" >> $GITHUB_PATH
./bin/micromamba shell init -s bash
echo $'channels:\n - conda-forge\n - bioconda\n - defaults\nuse_lockfiles: false' >> ~/.mambarc
- - name: Run pipeline with test data
-   if: matrix.exec_profile != 'conda'
+ - name: Install Singularity with defaults
+   if: matrix.exec_profile == 'singularity'
+   uses: singularityhub/install-singularity@main
+ - name: Run pipeline with test data in docker profile
+   if: matrix.exec_profile == 'docker'
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
@@ -82,6 +87,14 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,micromamba --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
+ - name: Run pipeline with test data in singularity profile
+   if: matrix.exec_profile == 'singularity'
+   # TODO nf-core: You can customise CI pipeline run tests as required
+   # For example: adding multiple test runs with different parameters
+   # Remember that you can parallelise this by using strategy.matrix
+   run: |
+     nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,$EXEC_PROFILE --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
- name: Gather failed logs
if: failure() || cancelled()
run: |
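For reference, the added Singularity matrix entry boils down to a single Nextflow invocation. A minimal local sketch (assuming Nextflow and Singularity are installed and the command is run from a checkout of the pipeline, which stands in for ${GITHUB_WORKSPACE}):

    # Hypothetical local equivalent of the new test_latest_dia + singularity CI combination
    TEST_PROFILE=test_latest_dia
    EXEC_PROFILE=singularity
    nextflow run . -profile ${TEST_PROFILE},${EXEC_PROFILE} --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
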
54 changes: 27 additions & 27 deletions bin/diann_convert.py
Expand Up @@ -292,7 +292,7 @@ def diann_version(self) -> str:
return diann_version_id

def validate_diann_version(self) -> None:
- supported_diann_versions = ["1.8.1"]
+ supported_diann_versions = ["1.8.1", "1.9.beta.1"]
if self.diann_version not in supported_diann_versions:
raise ValueError(f"Unsupported DIANN version {self.diann_version}")

@@ -591,13 +591,13 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):

logger.debug("Classifying results type ...")
pg["opt_global_result_type"] = "single_protein"
- pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group"
+ pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group"

out_mztab_PRH = pg
del pg
out_mztab_PRH = out_mztab_PRH.drop(["Protein.Names"], axis=1)
out_mztab_PRH.rename(
- columns={"Protein.Group": "accession", "First.Protein.Description": "description"}, inplace=True
+ columns={"First.Protein.Description": "description"}, inplace=True
)
out_mztab_PRH.loc[:, "database"] = database

@@ -614,10 +614,10 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
out_mztab_PRH.loc[:, i] = "null"

logger.debug("Extracting accession values (keeping first)...")
- out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["accession"].split(";")[0], axis=1)
+ out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["Protein.Group"].split(";")[0], axis=1)

protein_details_df = out_mztab_PRH[out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group"]
- prh_series = protein_details_df["Protein.Ids"].str.split(";", expand=True).stack().reset_index(level=1, drop=True)
+ prh_series = protein_details_df["Protein.Group"].str.split(";", expand=True).stack().reset_index(level=1, drop=True)
prh_series.name = "accession"
protein_details_df = (
protein_details_df.drop("accession", axis=1).join(prh_series).reset_index().drop(columns="index")
@@ -644,14 +644,14 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
# out_mztab_PRH.loc[out_mztab_PRH["opt_global_result_type"] == "single_protein", "ambiguity_members"] = "null"
# or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply(
- lambda x: x["Protein.Ids"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null",
+ lambda x: x["Protein.Group"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null",
axis=1,
)

logger.debug("Matching PRH to best search engine score...")
score_looker = ModScoreLooker(report)
out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply(
- lambda x: score_looker.get_score(x["Protein.Ids"]), axis=1, result_type="expand"
+ lambda x: score_looker.get_score(x["Protein.Group"]), axis=1, result_type="expand"
)

logger.debug("Matching PRH to modifications...")
@@ -664,16 +664,16 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
# This used to be a bottleneck in performance
# This implementation drops the run time from 57s to 25ms
protein_agg_report = (
- report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]]
- .groupby(["study_variable", "Protein.Ids"])
+ report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
+ .groupby(["study_variable", "Protein.Group"])
.agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
.reset_index()
- .pivot(columns=["study_variable"], index="Protein.Ids")
+ .pivot(columns=["study_variable"], index="Protein.Group")
.reset_index()
)
protein_agg_report.columns = ["::".join([str(s) for s in col]).strip() for col in protein_agg_report.columns.values]
subname_mapper = {
- "Protein.Ids::::": "Protein.Ids",
+ "Protein.Group::::": "Protein.Group",
"PG.MaxLFQ::mean": "protein_abundance_study_variable",
"PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
"PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
@@ -685,7 +685,7 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
# Oddly enough the last implementation mapped the accession (Q9NZJ9) in the mztab
# to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
out_mztab_PRH = out_mztab_PRH.merge(
- protein_agg_report, on="Protein.Ids", how="left", validate="many_to_one", copy=True
+ protein_agg_report, on="Protein.Group", how="left", validate="many_to_one", copy=True
)
del name_mapper
del subname_mapper
@@ -694,7 +694,7 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):

out_mztab_PRH.loc[:, "PRH"] = "PRT"
index = out_mztab_PRH.loc[:, "PRH"]
- out_mztab_PRH.drop(["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True)
+ out_mztab_PRH.drop(["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True)
out_mztab_PRH.insert(0, "PRH", index)
out_mztab_PRH.fillna("null", inplace=True)
out_mztab_PRH.loc[:, "database"] = database
@@ -734,12 +734,12 @@ def mztab_PEH(
out_mztab_PEH = pd.DataFrame()
out_mztab_PEH = pr.iloc[:, 0:10]
out_mztab_PEH.drop(
- ["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis=1, inplace=True
+ ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis=1, inplace=True
)
out_mztab_PEH.rename(
columns={
"Stripped.Sequence": "sequence",
- "Protein.Ids": "accession",
+ "Protein.Group": "accession",
"Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
"Precursor.Charge": "charge",
},
@@ -909,7 +909,7 @@ def __find_info(directory, n):
out_mztab_PSH = out_mztab_PSH[
[
"Stripped.Sequence",
- "Protein.Ids",
+ "Protein.Group",
"Q.Value",
"RT.Start",
"Precursor.Charge",
@@ -1014,7 +1014,7 @@ def classify_result_type(target):
:return: A string implying the protein type
:rtype: str
"""
- if ";" in target["Protein.Ids"]:
+ if ";" in target["Protein.Group"]:
return "indistinguishable_protein_group"
return "single_protein"

@@ -1056,7 +1056,7 @@ def match_in_report(report, target, max_, flag, level):
return tuple(q_value)

if flag == 1 and level == "protein":
- result = report[report["Protein.Ids"] == target]
+ result = report[report["Protein.Group"] == target]
PRH_params = []
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
@@ -1083,19 +1083,19 @@ def __init__(self, report: pd.DataFrame) -> None:

def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
grouped_df = (
- report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]]
+ report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
.sort_values("Global.PG.Q.Value", ascending=True)
- .groupby(["Protein.Ids"])
+ .groupby(["Protein.Group"])
.head(1)
)
- # Modified.Sequence Protein.Ids Global.PG.Q.Value
+ # Modified.Sequence Protein.Group Global.PG.Q.Value
# 78265 LFNEQNFFQR Q8IV63;Q8IV63-2;Q8IV63-3 0.000252
# 103585 NPTIVNFPITNVDLR Q53GS9;Q53GS9-2 0.000252
# 103586 NPTWKPLIR Q7Z4Q2;Q7Z4Q2-2 0.000252
# 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252

out = {
- row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows()
+ row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows()
}
return out

@@ -1325,17 +1325,17 @@ def calculate_protein_coverages(report: pd.DataFrame, out_mztab_PRH: pd.DataFram
protein in the PRH table (defined by accession, not protein.ids).
"""
nested_df = (
- report[["Protein.Ids", "Stripped.Sequence"]]
- .groupby("Protein.Ids")
+ report[["Protein.Group", "Stripped.Sequence"]]
+ .groupby("Protein.Group")
.agg({"Stripped.Sequence": set})
.reset_index()
)
- # Protein.Ids Stripped.Sequence
+ # Protein.Group Stripped.Sequence
# 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
# 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
# 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
- ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"]))
- acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Ids"]))
+ ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
+ acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Group"]))
fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
acc_to_fasta_ids: dict = {}

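The net effect of these edits is that every lookup, groupby, and merge in diann_convert.py is now keyed on the DIA-NN report's Protein.Group column instead of Protein.Ids. A quick way to see which of the two columns a given report actually carries is to inspect its header; a sketch, where diann_report.tsv is only a placeholder for the report file produced by your run:

    # List the protein-group columns present in a DIA-NN main report (hypothetical file name)
    head -n 1 diann_report.tsv | tr '\t' '\n' | grep -E '^Protein\.(Ids|Group)$'
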
6 changes: 3 additions & 3 deletions bin/psm_conversion.py
@@ -10,7 +10,7 @@

_parquet_field = [
"sequence", "protein_accessions", "protein_start_positions", "protein_end_positions",
- "modifications", "retention_time", "charge", "calc_mass_to_charge", "reference_file_name",
+ "modifications", "retention_time", "charge", "exp_mass_to_charge", "reference_file_name",
"scan_number", "peptidoform", "posterior_error_probability", "global_qvalue", "is_decoy",
"consensus_support", "mz_array", "intensity_array", "num_peaks", "search_engines", "id_scores", "hit_rank"
]
@@ -61,7 +61,7 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):

for peptide_id in pep_ids:
retention_time = peptide_id.getRT()
- calc_mass_to_charge = peptide_id.getMZ()
+ exp_mass_to_charge = peptide_id.getMZ()
scan_number = int(re.findall(r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference"))[0][1])

if isinstance(spectra_df, pd.DataFrame):
@@ -101,7 +101,7 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):
hit_rank = hit.getRank()

parquet_data.append([sequence, protein_accessions, protein_start_positions, protein_end_positions,
- modifications, retention_time, charge, calc_mass_to_charge, reference_file_name,
+ modifications, retention_time, charge, exp_mass_to_charge, reference_file_name,
scan_number, peptidoform, posterior_error_probability, global_qvalue, is_decoy,
consensus_support, mz_array, intensity_array, num_peaks, search_engines, id_scores,
hit_rank])
48 changes: 48 additions & 0 deletions conf/test_latest_dia.config
@@ -0,0 +1,48 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests (DIA)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple test.
Use as follows:
nextflow run nf-core/quantms -profile test_latest_dia,<docker/singularity> [--outdir <OUTDIR>]
------------------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile for latest DIA'
config_profile_description = 'Minimal test dataset to check pipeline function for the data-independent acquisition pipeline branch for latest DIA-NN.'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = 6.GB
max_time = 48.h

outdir = './results_latest_dia'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/dia_ci/PXD026600.sdrf.tsv'
database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/dia_ci/REF_EColi_K12_UPS1_combined.fasta'
diann_version = '1.9.beta.1'
min_pr_mz = 350
max_pr_mz = 950
min_fr_mz = 500
max_fr_mz = 1500
min_peptide_length = 15
max_peptide_length = 30
max_precursor_charge = 3
allowed_missed_cleavages = 1
diann_normalize = false
skip_post_msstats = false
publish_dir_mode = 'symlink'
max_mods = 2
}

process {
// thermorawfileparser
withName: 'NFCORE_QUANTMS:QUANTMS:FILE_PREPARATION:THERMORAWFILEPARSER' {
publishDir = [path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, pattern: "*.log" ]
}
}

8 changes: 6 additions & 2 deletions modules/local/assemble_empirical_library/main.nf
@@ -2,9 +2,13 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
tag "$meta.experiment_id"
label 'process_low'

- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ if (params.diann_version == "1.9.beta.1") {
+     container 'https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif'
+ } else {
+     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
          'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' :
          'docker.io/biocontainers/diann:v1.8.1_cv1' }"
+ }

input:
// In this step the real files are passed, and not the names
@@ -58,7 +62,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)")
+ DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "\\d+\\.\\d+(\\.\\w+)*(\\.[\\d]+)?")
END_VERSIONS
"""
}
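The version-capture regex is widened because the old pattern only matched purely numeric versions and would truncate a pre-release string such as 1.9.beta.1. A quick check of the new expression against example banner lines (a sketch, assuming GNU grep with -P support; the strings are illustrative, not actual DIA-NN output):

    echo "DIA-NN 1.8.1"      | grep -oP '\d+\.\d+(\.\w+)*(\.[\d]+)?'   # prints 1.8.1
    echo "DIA-NN 1.9.beta.1" | grep -oP '\d+\.\d+(\.\w+)*(\.[\d]+)?'   # prints 1.9.beta.1
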
9 changes: 6 additions & 3 deletions modules/local/diann_preliminary_analysis/main.nf
@@ -2,9 +2,13 @@ process DIANN_PRELIMINARY_ANALYSIS {
tag "$ms_file.baseName"
label 'process_high'

- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ if (params.diann_version == "1.9.beta.1") {
+     container 'https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif'
+ } else {
+     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
          'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' :
          'docker.io/biocontainers/diann:v1.8.1_cv1' }"
+ }

input:
tuple val(meta), path(ms_file), path(predict_library)
@@ -13,7 +17,6 @@ process DIANN_PRELIMINARY_ANALYSIS {
path "*.quant", emit: diann_quant
tuple val(meta), path("*_diann.log"), emit: log
path "versions.yml", emit: version
- path(ms_file), emit: preliminary_ms_file

when:
task.ext.when == null || task.ext.when
@@ -61,7 +64,7 @@ process DIANN_PRELIMINARY_ANALYSIS {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)")
+ DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "\\d+\\.\\d+(\\.\\w+)*(\\.[\\d]+)?")
END_VERSIONS
"""
}
12 changes: 9 additions & 3 deletions modules/local/diannsummary/main.nf
@@ -2,9 +2,13 @@ process DIANNSUMMARY {
tag "$meta.experiment_id"
label 'process_high'

- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ if (params.diann_version == "1.9.beta.1") {
+     container 'https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif'
+ } else {
+     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
          'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' :
          'docker.io/biocontainers/diann:v1.8.1_cv1' }"
+ }

input:
// Note that the files are passed as names and not paths, this prevents them from being staged
@@ -24,7 +28,9 @@ process DIANNSUMMARY {
path "diann_report.gg_matrix.tsv", emit: gg_matrix
path "diann_report.unique_genes_matrix.tsv", emit: unique_gene_matrix
path "diannsummary.log", emit: log
- path "empirical_library.tsv.speclib", emit: final_speclib
+ path "empirical_library.tsv", emit: final_speclib optional true
+ path "empirical_library.tsv.speclib", emit: final_tsv_speclib optional true
+ path "empirical_library.tsv.skyline.speclib", emit: skyline_speclib optional true
path "versions.yml", emit: version

when:
Expand Down Expand Up @@ -69,7 +75,7 @@ process DIANNSUMMARY {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)")
+ DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "\\d+\\.\\d+(\\.\\w+)*(\\.[\\d]+)?")
END_VERSIONS
"""
}
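To try the 1.9.1dev image that these modules now select when params.diann_version is set to 1.9.beta.1, something like the following should work (a sketch, assuming Singularity is installed locally and that the image exposes diann on its PATH, as the pipeline's commands imply):

    # Download the pre-built SIF referenced by the modules and print its version banner
    singularity pull diann-1.9.1dev.sif https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/tools/ghcr.io-bigbio-diann-1.9.1dev.sif
    singularity exec diann-1.9.1dev.sif diann 2>&1 | grep "DIA-NN"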