Merge pull request #387 from daichengxin/bug/samplecheck
Bug/samplecheck
ypriverol authored Jul 16, 2024
2 parents ee788b4 + d34ee73 commit c4e35a2
Showing 9 changed files with 60 additions and 24 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,22 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased] nfcore/quantms

### `Added`

- [#386](https://github.com/bigbio/quantms/pull/386) Make validation of ontology terms optional

### `Changed`

### `Fixed`

### `Dependencies`

### `Parameters`

- `validate_ontologies`: enable or disable validating ontologies in the input SDRF file.

## [1.3.0] nfcore/quantms - [08/04/2024] - Santiago de Cuba

### `Added`
27 changes: 17 additions & 10 deletions bin/check_samplesheet.py
@@ -20,6 +20,7 @@ def parse_args(args=None):
    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("SDRF", help="SDRF/Expdesign file to be validated")
    parser.add_argument("ISSDRF", help="SDRF file or Expdesign file")
    parser.add_argument("VALIDATE_ONTOLOGIES", help="Validate ontology terms.")
    parser.add_argument("--CHECK_MS", help="check mass spectrometry fields in SDRF.", action="store_true")

    return parser.parse_args(args)
@@ -44,20 +45,26 @@ def print_error(error, context="Line", context_str=""):
    sys.exit(1)


def check_sdrf(check_ms, sdrf):
def check_sdrf(check_ms, sdrf, validate_ontologies):
    df = SdrfDataFrame.parse(sdrf)
    errors = df.validate(DEFAULT_TEMPLATE)
    if check_ms:
        errors = errors + df.validate(MASS_SPECTROMETRY)
    for error in errors:
        print(error)
    if not errors:
        print("Everything seems to be fine. Well done.")
    if validate_ontologies:
        errors = df.validate(DEFAULT_TEMPLATE)
        if check_ms:
            errors = errors + df.validate(MASS_SPECTROMETRY)
        for error in errors:
            print(error)
        if not errors:
            print("Everything seems to be fine. Well done.")
        else:
            print("There were validation errors!")
    else:
        print("There were validation errors!")
        errors = False
        print("No ontology term validation was performed.")

    sys.exit(bool(errors))


def check_expdesign(expdesign):
    data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str)
    data = data.dropna()
@@ -117,7 +124,7 @@ def main(args=None):
    args = parse_args(args)

    if args.ISSDRF == "true":
        check_sdrf(args.CHECK_MS, args.SDRF)
        check_sdrf(args.CHECK_MS, args.SDRF, args.VALIDATE_ONTOLOGIES == "true")
    else:
        check_expdesign(args.SDRF)

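Taken together, these changes let the samplesheet check skip ontology lookups while still parsing the SDRF. A minimal sketch of driving the updated function directly (illustration only, not part of this commit; the file name is a placeholder and it assumes sdrf-pipelines is installed and bin/ is importable):

# Hypothetical usage of the updated helper.
from check_samplesheet import check_sdrf

# With validate_ontologies=False the SDRF is still parsed by SdrfDataFrame.parse(),
# but no template or mass-spectrometry validation runs; errors stays False and the
# function terminates the interpreter via sys.exit(bool(errors)), i.e. exit code 0.
check_sdrf(check_ms=True, sdrf="experiment.sdrf.tsv", validate_ontologies=False)
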
2 changes: 2 additions & 0 deletions bin/psm_conversion.py
@@ -17,6 +17,8 @@


def mods_position(peptide):
    if peptide.startswith("."):
        peptide = peptide[1:]
    pattern = re.compile(r"\((.*?)\)")
    original_mods = pattern.findall(peptide)
    peptide = re.sub(r"\(.*?\)", ".", peptide)
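
The two added lines matter because mods_position() later replaces every "(mod)" group with a "." placeholder, so a leading terminal "." would be indistinguishable from a modification marker. A small illustration of that interaction (the remainder of mods_position is not shown in this diff; the position extraction below is a hypothetical continuation, not the file's code):

import re

peptide = ".PEPT(Phospho)IDEK"                     # peptide string with a leading terminal dot
if peptide.startswith("."):
    peptide = peptide[1:]
pattern = re.compile(r"\((.*?)\)")
original_mods = pattern.findall(peptide)           # ['Phospho']
masked = re.sub(r"\(.*?\)", ".", peptide)          # 'PEPT.IDEK'
positions = [i for i, c in enumerate(masked) if c == "."]
print(list(zip(original_mods, positions)))         # [('Phospho', 4)]
# Without the strip, masked would be '.PEPT.IDEK' and index 0 would be
# misread as a modification site.
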
25 changes: 14 additions & 11 deletions docs/output.md
@@ -14,16 +14,18 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
2. (optional) Decoy database generation for the provided DB (fasta) with OpenMS
3. Database search with either MSGF+ and/or Comet through OpenMS adapters
4. Re-mapping potentially identified peptides to the input database for consistency and error-checking (using OpenMS' PeptideIndexer)
5. PSM rescoring using PSMFeatureExtractor and Percolator or a PeptideProphet-like distribution fitting approach in OpenMS
6. If multiple search engines were chosen, the results are combined with OpenMS' ConsensusID
7. If multiple search engines were chosen, a combined FDR is calculated
8. Single run PSM/Peptide-level FDR filtering
9. If localization of modifications was requested, Luciphor2 is applied via the OpenMS adapter
10. (**DDA-LFQ**) Protein inference and label-free quantification based on spectral counting or MS1 feature detection, alignment and integration with OpenMS' ProteomicsLFQ. Performs an additional experiment-wide FDR filter at the protein level (and, if requested, at the peptide/PSM level).
11. (**DDA-ISO**) Extracts and normalizes isobaric labeling
12. (**DDA-ISO**) Protein inference using the OpenMS ProteinInference tool. In addition, protein FDR filtering is performed in this step for Isobaric datasets (TMT, iTRAQ).
13. (**DDA-ISO**) Protein Quantification
14. Generation of QC reports using pMultiQC, a library for QC of proteomics data analysis.
5. (optional) Runs LC-MS predictors such as MS²PIP and DeepLC via MS2Rescore to add new PSM features
6. (optional) Merging of different MS runs by sample or across the whole project
7. PSM rescoring using PSMFeatureExtractor and Percolator or a PeptideProphet-like distribution fitting approach in OpenMS
8. If multiple search engines were chosen, the results are combined with OpenMS' ConsensusID
9. If multiple search engines were chosen, a combined FDR is calculated
10. Single run PSM/Peptide-level FDR filtering
11. If localization of modifications was requested, Luciphor2 is applied via the OpenMS adapter
12. (**DDA-LFQ**) Protein inference and label-free quantification based on spectral counting or MS1 feature detection, alignment and integration with OpenMS' ProteomicsLFQ. Performs an additional experiment-wide FDR filter at the protein level (and, if requested, at the peptide/PSM level).
13. (**DDA-ISO**) Extracts and normalizes isobaric labeling
14. (**DDA-ISO**) Protein inference using the OpenMS ProteinInference tool. In addition, protein FDR filtering is performed in this step for Isobaric datasets (TMT, iTRAQ).
15. (**DDA-ISO**) Protein Quantification
16. Generation of QC reports using pMultiQC, a library for QC of proteomics data analysis.

For DIA-LFQ experiments, the workflow is different:

@@ -100,7 +102,8 @@ different handling between peptide search engines.
#### Identifications

Intermediate output for the PSM/peptide-level filtered identifications per raw/mzML file happens in OpenMS'
internal [idXML](https://github.com/OpenMS/OpenMS/blob/develop/share/OpenMS/SCHEMAS/IdXML_1_5.xsd) format. Only for DDA currently.
internal [idXML](https://github.com/OpenMS/OpenMS/blob/develop/share/OpenMS/SCHEMAS/IdXML_1_5.xsd) format. quantms also provides a CSV output format in the identification subworkflow.
Only for DDA currently.
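
For downstream scripting the CSV variant is the easier of the two to consume. A hypothetical reading example (the actual file name and column layout are not specified here and are placeholders):

import pandas as pd

# Placeholder path: one identification table per raw/mzML file; inspect the
# columns rather than assuming them.
psms = pd.read_csv("run01_psms.csv")
print(psms.columns.tolist())
print(psms.head())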

#### Quantities

2 changes: 1 addition & 1 deletion modules/local/ms2rescore/main.nf
@@ -2,7 +2,7 @@ process MS2RESCORE {
tag "$meta.mzml_id"
label 'process_high'

conda "bioconda::ms2rescore=3.0.3 bioconda::psm-utils=0.8.0 conda-forge::pydantic=1.10"
conda "bioconda::ms2rescore=3.0.3 bioconda::psm-utils=0.8.2 conda-forge::pydantic=1.10.14 pygam=0.9.1 bioconda::deeplc=2.2.27 bioconda::ms2pip=4.0.0.dev8 bioconda::deeplcretrainer=0.2.11 conda-forge::scikit-learn=1.4.2 conda-forge::scipy=1.13.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.3--pyhdfd78af_0':
'biocontainers/ms2rescore:3.0.3--pyhdfd78af_0' }"
3 changes: 2 additions & 1 deletion modules/local/samplesheet_check.nf
@@ -13,6 +13,7 @@ process SAMPLESHEET_CHECK {
    input:
    path input_file
    val is_sdrf
    val validate_ontologies

    output:
    path "*.log", emit: log
@@ -27,7 +28,7 @@
    def args = task.ext.args ?: ''

    """
    check_samplesheet.py "${input_file}" ${is_sdrf} --CHECK_MS 2>&1 | tee input_check.log
    check_samplesheet.py "${input_file}" ${is_sdrf} ${validate_ontologies} --CHECK_MS 2>&1 | tee input_check.log
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
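
The rendered command now carries the extra positional argument. A rough Python stand-in for what the process executes (illustrative only; in the pipeline Nextflow interpolates is_sdrf and params.validate_ontologies as the strings "true"/"false"):

import subprocess

# Placeholder SDRF path; the three positionals map to SDRF, ISSDRF and
# VALIDATE_ONTOLOGIES in bin/check_samplesheet.py.
cmd = ["check_samplesheet.py", "experiment.sdrf.tsv", "true", "false", "--CHECK_MS"]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
print("exit code:", result.returncode)  # non-zero signals validation errors
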
1 change: 1 addition & 0 deletions nextflow.config
@@ -19,6 +19,7 @@ params {

    // Input options
    input = null
    validate_ontologies = true

    // Tools flags
    posterior_probabilities = 'percolator'
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -70,6 +70,12 @@
"description": "Whether export PSM from decoy in final identification results",
"fa_icon": "far fa-check-square",
"help_text": "Whether export PSM from decoy in final identification results for dda_id subworkflow for specific cases."
},
"validate_ontologies": {
"type": "boolean",
"description": "Check that ontology terms in an input SDRF file exist.",
"fa_icon": "far fa-check-square",
"help_text": "If false, only a basic readability check is performed on an input SDRF file. This option is useful when ontology providers are inaccessible."
}
}
},
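
The "basic readability check" mentioned in the help_text corresponds, at script level, to the SdrfDataFrame.parse() call that always runs, while df.validate() is the part that needs ontology services. A sketch of that split using the names already visible in bin/check_samplesheet.py (the import paths are assumptions based on the sdrf-pipelines package, not shown in this diff):

from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame                               # assumed import path
from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY  # assumed import path

df = SdrfDataFrame.parse("experiment.sdrf.tsv")   # always runs: the file must be readable and parsable

validate_ontologies = False                       # what --validate_ontologies false boils down to
if validate_ontologies:
    # Template and MS checks need ontology lookups and may fail when providers are down.
    errors = df.validate(DEFAULT_TEMPLATE) + df.validate(MASS_SPECTROMETRY)
else:
    errors = []                                   # no term-level validation performed
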
2 changes: 1 addition & 1 deletion subworkflows/local/input_check.nf
@@ -18,7 +18,7 @@ workflow INPUT_CHECK {
            exit 1
        }
    }
    SAMPLESHEET_CHECK ( input_file, is_sdrf )
    SAMPLESHEET_CHECK ( input_file, is_sdrf, params.validate_ontologies )

    emit:
    ch_input_file = SAMPLESHEET_CHECK.out.checked_file
