diff --git a/.github/workflows/run_pytests.yml b/.github/workflows/run_pytests.yml index 7b3d56f..965b212 100644 --- a/.github/workflows/run_pytests.yml +++ b/.github/workflows/run_pytests.yml @@ -7,6 +7,7 @@ on: branches: - main - development + - "*hotfix*" pull_request: types: [ opened, synchronize] # Allows you to run this workflow manually from the Actions tab diff --git a/CHANGELOG.md b/CHANGELOG.md index c29db2b..528ef67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.3.4] + +### Changed + +- Table updates (associated with updating ISA archive files) now separates multiple files in a field with ',' instead of ', ' + +## [1.3.3] + +### Added + +- Support for data asset key sets and run components in updated validation interface (i.e. 
by 'dpt validation') + +## [1.3.2] + +### Fixed + +- Refactored ISA archive parsing functions as prior the fallback wasn't being used in all calls (specifically the plug in based ones) + +## [1.3.1] + +### Fixed + +- Parsing for ISA Archives met 'ISO-8859-1' encoding but not 'utf-8' + - Specifically, 'utf-8' is attempted and 'ISO-8859-1' is used as a fallback + ## [1.3.0] ### Added @@ -170,3 +195,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [1.2.0]: https://github.com/j-81/dp_tools/compare/1.1.9...1.2.0 [1.2.1]: https://github.com/j-81/dp_tools/compare/1.2.0...1.2.1 [1.3.0]: https://github.com/j-81/dp_tools/compare/1.2.1...1.3.0 +[1.3.1]: https://github.com/j-81/dp_tools/compare/1.3.0...1.3.1 +[1.3.2]: https://github.com/j-81/dp_tools/compare/1.3.1...1.3.2 +[1.3.3]: https://github.com/j-81/dp_tools/compare/1.3.2...1.3.3 +[1.3.4]: https://github.com/j-81/dp_tools/compare/1.3.3...1.3.4 diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md new file mode 100644 index 0000000..1c304b2 --- /dev/null +++ b/INSTRUCTIONS.md @@ -0,0 +1,60 @@ +# This document explains usage from Gitpod; however, beside installation, these may (untested) also work when running from containers (wrapped in appropriate `singularity` or `docker` invocations) + + +## Installation + +``` +cd $REPO_DIRECTORY # e.g. /workspace/dp_tools in gitpod +pip install -e . +``` + +## Download Relevant MultiQC & ISA archive + +> python download_multiqc_from_OSD.py --osd-id --output-dir + +### Known Limitations / Issues + +* Only supports datasets with `read distribution` MultiQC files (used as a proxy for whether the dataset is actually sequencing transcriptomics) +** Future: Should rely on parsing metadata from API + +## Copy required configuration files + +> bash set_up_config_files.sh + +This copies template yaml files from the repository code. + +## CD into directory + +> cd + +## Modify configuration files + + +### isa_config.yaml + +1. Initially, no changes +2. 
If encountering error like: `ValueError: Could not find required column '['Parameter Value[Stranded]', 'Parameter Value[stranded]']' in either ISA sample or assay table.` + * Comment out or modify item in `Staging: -> General: -> Required Metadata: -> From ISA:` section of yaml + +### extraction_settings.yaml + +1. MUST: change root search directory (line 2) to directory containing multiQC reports generated at start of this document +1. MAY: need to disable section for certain multiQC (not likely useful / will very probably break summarization) + +## Run extract & summarize script + +> python ../extract_dataset.py --osd-id # You should still be in the directory with the multiQC outputs & yaml files + +Outputs: + +1. _metrics.csv # Exhaustive metrics as pulled from multiQC reports +2. _summary.csv # Summarization and derived statistics as generated on the exhaustive metrics table + + +## Overall Known Limitations + +* Currently only supports paired end sequencing transcriptomics + * Updating this will require updating both extraction & summarization code + +* Certain ISA archives may not work + * While most missing or encoded off-spec metadata can be addressed by disabling (commenting out) sections in `extraction_settings.yaml`, certain ones like missing `library layout` (unlikely but an example) will likely require more significant changes to accommodate. 
diff --git a/OSD-201/extraction_settings.yaml b/OSD-201/extraction_settings.yaml new file mode 100644 index 0000000..e55a71c --- /dev/null +++ b/OSD-201/extraction_settings.yaml @@ -0,0 +1,130 @@ +Extraction Settings: + root search directory: "/workspace/dp_tools/OSD-201" + sections: + - name: "raw reads" + enabled: True + multiQC: + from json: + - "raw_multiqc_report" + - "raw_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "00-RawData" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "trimmed reads" + enabled: True + multiQC: + from json: + - "trimmed_multiqc_report" + - "trimmed_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "01-TG_Preproc" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "aligned reads" + enabled: True + multiQC: + from json: + - "align_multiqc_report" + - "align_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "02-STAR_Alignment" + logs pattern(s): + - "*Log.final.out" + modules: + - "star" + + - name: "rseqc: genebody coverage" + enabled: True + multiQC: + from json: + - "geneBody_cov_multiqc_report" + - "geneBody_cov_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "02_geneBody_coverage" + logs pattern(s): + - "*.geneBodyCoverage.txt" + modules: + - "rseqc" + + - name: "rseqc: infer experiment" + enabled: True + multiQC: + from json: + - "infer_exp_multiqc_report" + - "infer_exp_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "03_infer_experiment" + logs pattern(s): + - "*infer_expt.out" + modules: + - "rseqc" + + - name: "rseqc: inner distance" + enabled: True + multiQC: + from json: + - "inner_dist_multiqc_report" + - "inner_dist_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - 
"RSeQC_Analyses" + - "04_inner_distance" + logs pattern(s): + - "*inner_distance.txt" + modules: + - "rseqc" + + - name: "rseqc: read distribution" + enabled: True + multiQC: + from json: + - "read_dist_multiqc_report" + - "read_dist_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "05_read_distribution" + logs pattern(s): + - "*read_dist.out" + modules: + - "rseqc" + + + - name: "rsem count" + enabled: True + multiQC: + from json: + - "RSEM_count_multiqc_report" + - "RSEM_count_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "03-RSEM_Counts" + logs pattern(s): + - "*.stat" + modules: + - "rsem" + diff --git a/OSD-201/isa_config.yaml b/OSD-201/isa_config.yaml new file mode 100644 index 0000000..fc6b92e --- /dev/null +++ b/OSD-201/isa_config.yaml @@ -0,0 +1,1321 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be renamed to "RSeQC_Analyses" for consistent casing? -J.O. 
, this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates is ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: + - Characteristics[Material Type] + # - Characteristics[organism] + ISA Table Source: Assay + Runsheet Column Name: Tissue Type + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: + - Parameter Value[library selection] + # - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: Library Prep Method + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: PE or SE + Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used for metrics table + Example: 'PE' + + # - ISA Field Name: + # - Parameter Value[Stranded] + # - Parameter Value[stranded] + # ISA Table Source: Assay + # Runsheet Column Name: Stranded or Unstranded + # # Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + # Processing Usage: >- + # Used for metrics table + # Example: 'STRANDED' + + - ISA Field Name: + - Parameter Value[rRNA Contamination] + # - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: '% rRNA contamination' + # Append Column Following: "Unit" + Processing Usage: >- + Used for metrics table + Example: '13.212 percent' + + # this entry denotes the following: + # retrive from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one 
url is not mapped to each filename + # - ISA Field Name: + # - Parameter Value[Merged Sequence Data File] + # - Characteristics[Merged Sequence Data File] + # - Raw Data File + # ISA Table Source: Assay + # Multiple Values Per Entry: true + # Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + # Runsheet Column Name: + # - {'name':'read1_path', 'index':0} + # - {'name':'read2_path', 'index':1, 'optional':true} + # GLDS URL Mapping: true + # Processing Usage: >- + # Location to the raw data fastq file. May be a url or local path. + # Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + # - ISA Field Name: Factor Value[{factor_name}] + # ISA Table Source: [Assay, Sample] + # Runsheet Column Name: Factor Value[{factor_name}] + # Matches Multiple Columns: true + # Match Regex: "Factor Value\\[.*\\]" + # Append Column Following: "Unit" + # Processing Usage: >- + # Factor values in a study. Used to assign experimental groups for each sample. + # Note: On the runsheet, a subsequent 'Unit' Column value will be + # suffix-concatenated if it exists. + # Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. + Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. 
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory 
ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - 
"{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - 
*trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - &trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + 
tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - 
*alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - 
"05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + 
resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incoporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC if made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" \ No newline at end of file diff --git a/assets/extraction_conf.yaml b/assets/extraction_conf.yaml new file mode 100644 index 0000000..d4d7b5f --- /dev/null +++ b/assets/extraction_conf.yaml @@ -0,0 +1,49 @@ +Extraction Settings: + root search directory: "/CHANGEME/TO/WHERE/MQC/ARE" + sections: + - name: "raw reads" + enabled: True + multiQC: + from json: + - "raw_multiqc_report" + - "raw_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "00-RawData" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "trimmed reads" + enabled: True + multiQC: + from json: + - "trimmed_multiqc_report" + - "trimmed_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "01-TG_Preproc" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "aligned reads" + enabled: True + multiQC: + from json: + - "align_multiqc_report" + - "align_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "02-STAR_Alignment" + logs pattern(s): + - "*Log.final.out" + modules: + - "star" diff --git a/assets/isa_config.yaml b/assets/isa_config.yaml new file mode 100644 index 0000000..bb7ebb9 --- /dev/null +++ b/assets/isa_config.yaml @@ -0,0 +1,1325 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be 
renamed to "RSeQC_Analyses" for consistent casing? -J.O. , this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates if ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: + - Characteristics[Material Type] + # - Characteristics[organism] + ISA Table Source: Assay + Runsheet Column Name: Tissue Type + Processing Usage: >- + Used in metrics table + Example: Left retina + Fallback Value: "NOT FOUND IN ISA COLUMNS" + + - ISA Field Name: + - Parameter Value[library selection] + - Parameter Value[library Selection] # Alternative casing found first in OSD-120 + # - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: Library Prep Method + Processing Usage: >- + Used in metrics table + Example: Left retina + Fallback Value: "NOT FOUND IN ISA COLUMNS" + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: PE or SE + Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used in metrics table + Example: 'PE' + + - ISA Field Name: + - Parameter Value[Stranded] + - Parameter Value[stranded] + ISA Table Source: Assay + Runsheet Column Name: Stranded or Unstranded + # Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used in metrics table + Example: 'STRANDED' + Fallback Value: "NOT FOUND IN ISA COLUMNS" + + - ISA Field Name: + - Parameter Value[rRNA Contamination] + # - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: '% rRNA contamination' + # Append Column Following: "Unit" + Processing Usage: >- + Used in metrics table + Example: '13.212 percent' + + # this entry denotes the following: + # retrieve from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index 
doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped to each filename + # - ISA Field Name: + # - Parameter Value[Merged Sequence Data File] + # - Characteristics[Merged Sequence Data File] + # - Raw Data File + # ISA Table Source: Assay + # Multiple Values Per Entry: true + # Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + # Runsheet Column Name: + # - {'name':'read1_path', 'index':0} + # - {'name':'read2_path', 'index':1, 'optional':true} + # GLDS URL Mapping: true + # Processing Usage: >- + # Location to the raw data fastq file. May be a url or local path. + # Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + # - ISA Field Name: Factor Value[{factor_name}] + # ISA Table Source: [Assay, Sample] + # Runsheet Column Name: Factor Value[{factor_name}] + # Matches Multiple Columns: true + # Match Regex: "Factor Value\\[.*\\]" + # Append Column Following: "Unit" + # Processing Usage: >- + # Factor values in a study. Used to assign experimental groups for each sample. + # Note: On the runsheet, a subsequent 'Unit' Column value will be + # suffix-concatenated if it exists. + # Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. 
+ Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). 
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + 
+ resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include 
subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - 
&trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. 
For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: 
*AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS 
Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" 
+ + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incoporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC if made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" diff --git a/download_multiqc_from_OSD.py b/download_multiqc_from_OSD.py new file mode 100644 index 0000000..f94f588 --- /dev/null +++ b/download_multiqc_from_OSD.py @@ -0,0 +1,54 @@ +import sys +from pathlib import Path +import zipfile + +import requests +import click + +from dp_tools.glds_api.commons import find_matching_filenames, retrieve_file_url + +@click.command() +@click.option("--osd-id", help='OSD Accession ID. e.g. "OSD-194"', required=True) +@click.option("--output-dir", help='Output directory', required=False, default=".") +def main(osd_id, output_dir): + + + files = find_matching_filenames(accession=osd_id, filename_pattern=".*multiqc.*.zip") + # Ensure we also download ISA archive + files.extend( + find_matching_filenames(accession=osd_id, filename_pattern=".*ISA.*.zip") + ) + + if not any(["align_" in f for f in files]): + print( + "Did not locate align_multiqc_report zip. Inferring this isn't a dataset to be used." 
+ ) + sys.exit(0) + + + def download_file(url, local_filename): + print(f"Saving file: {local_filename} from {url}") + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(local_filename, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + def unzip_file(zip_file_name, extraction_location): + print(f"Unzipping file: {zip_file_name}") + # open the zip file in read mode + with zipfile.ZipFile(zip_file_name, 'r') as zip_ref: + # extract all the contents into the directory + zip_ref.extractall(extraction_location) + + # Setup output dir + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + for f in files: + file_location = output_dir / f + download_file(retrieve_file_url(osd_id, f), file_location) + unzip_file(file_location, output_dir) + +if __name__ == '__main__': + main() diff --git a/dp_tools/__init__.py b/dp_tools/__init__.py index 902d8c4..cd404f2 100644 --- a/dp_tools/__init__.py +++ b/dp_tools/__init__.py @@ -28,7 +28,7 @@ #### Using pip -> pip install git+https://github.com/J-81/dp_tools.git@1.3.0 +> pip install git+https://github.com/J-81/dp_tools.git@1.3.4 ## CLI Commands @@ -74,7 +74,7 @@ ``` bash # First two lines tell Singularity to run the dp_tools container in the current working directory singularity exec --bind $(pwd):$(pwd) \\ - docker://quay.io/j_81/dp_tools:1.3.0 \\ + docker://quay.io/j_81/dp_tools:1.3.4 \\ dpt-get-isa-archive --accession GLDS-168 # command we want to run ``` @@ -82,7 +82,7 @@ ``` bash # First two lines tell Singularity to run the dp_tools container in the current working directory singularity exec --bind $(pwd):$(pwd) \\ - docker://quay.io/j_81/dp_tools:1.3.0 \\ + docker://quay.io/j_81/dp_tools:1.3.4 \\ dpt-isa-to-runsheet --accession GLDS-168 \\ --config-type bulkRNASeq \\ --config-version Latest \\ @@ -90,4 +90,4 @@ ``` """ -__version__ = "1.3.0" +__version__ = "1.3.4" diff --git a/dp_tools/core/check_model.py b/dp_tools/core/check_model.py index 
78703e6..043ac3a 100644 --- a/dp_tools/core/check_model.py +++ b/dp_tools/core/check_model.py @@ -3,72 +3,84 @@ from contextlib import contextmanager import enum -from typing import ( - Callable, - TypedDict, - Union, - Literal -) +from typing import Callable, TypedDict, Union, Literal import pandas as pd from loguru import logger as log - - ALLOWED_DEV_EXCEPTIONS = ( Exception # Hooking into this with monkeypatch can be handy for testing ) -def run_manual_check(start_instruction, pass_or_fail_questions, pass_or_flag_questions) -> 'FlagEntry': - input(f"Manual Check Start Instructions: \n\t{start_instruction}.\nPress Enter to continue to questions..") + +def run_manual_check( + start_instruction, pass_or_fail_questions, pass_or_flag_questions +) -> "FlagEntry": + input( + f"Manual Check Start Instructions: \n\t{start_instruction}.\nPress Enter to continue to questions.." + ) top_level_code = FlagCode.GREEN def pass_or_fail_prompt(question: str): - ALLOWED = { # Lambda used to ensure both static and analyst responses can be supplied - "Y": (lambda: "Yes", FlagCode.GREEN), - "JF": (lambda: input("Expand on reason for failure: ").replace("\n",":::NEWLINE:::"), FlagCode.HALT), - "UF": (lambda: "No",FlagCode.HALT) - } - + ALLOWED = ( + { # Lambda used to ensure both static and analyst responses can be supplied + "Y": (lambda: "Yes", FlagCode.GREEN), + "JF": ( + lambda: input("Expand on reason for failure: ").replace( + "\n", ":::NEWLINE:::" + ), + FlagCode.HALT, + ), + "UF": (lambda: "No", FlagCode.HALT), + } + ) + while True: try: resp = ALLOWED[input(f"{question} (Y/JF/UF) : ").upper()] - return (resp[0](), resp[1]) # evalute in case justification is provided + return (resp[0](), resp[1]) # evalute in case justification is provided except KeyError: print(f"Invalid response! 
Only {list(ALLOWED)} values are allowed") continue - + def pass_or_flag_prompt(question: str): - ALLOWED = { # Lambda used to ensure both static and analyst responses can be supplied - "Y": (lambda: "Yes", FlagCode.GREEN), - "JF": (lambda: input("Expand on reason for failure: ").replace("\n",":::NEWLINE:::"), FlagCode.RED), - "UF": (lambda: "No",FlagCode.RED) - } - + ALLOWED = ( + { # Lambda used to ensure both static and analyst responses can be supplied + "Y": (lambda: "Yes", FlagCode.GREEN), + "JF": ( + lambda: input("Expand on reason for failure: ").replace( + "\n", ":::NEWLINE:::" + ), + FlagCode.RED, + ), + "UF": (lambda: "No", FlagCode.RED), + } + ) + while True: try: resp = ALLOWED[input(f"{question} (Y/JF/UF) : ").upper()] - return (resp[0](), resp[1]) # evalute in case justification is provided + return (resp[0](), resp[1]) # evalute in case justification is provided except KeyError: print(f"Invalid response! Only {list(ALLOWED)} values are allowed") continue - + responses: dict[str, dict[str, list[tuple[str, FlagCode]]]] = { - "pass/fail": {}, - "pass/flag": {}, - } + "pass/fail": {}, + "pass/flag": {}, + } for question in pass_or_fail_questions: - responses['pass/fail'][question] = pass_or_fail_prompt(question) - if responses['pass/fail'][question][1] == FlagCode.HALT: + responses["pass/fail"][question] = pass_or_fail_prompt(question) + if responses["pass/fail"][question][1] == FlagCode.HALT: top_level_code = FlagCode.HALT - + for question in pass_or_flag_questions: - responses['pass/flag'][question] = pass_or_flag_prompt(question) - if responses['pass/flag'][question][1] == FlagCode.RED: + responses["pass/flag"][question] = pass_or_flag_prompt(question) + if responses["pass/flag"][question][1] == FlagCode.RED: top_level_code = max([top_level_code, FlagCode.RED]) - return {"code": top_level_code, "message" : str(responses)} + return {"code": top_level_code, "message": str(responses)} class FlagCode(enum.Enum): @@ -321,7 +333,7 @@ def 
ancestry_is_in(self, other_list): class _QueuedCheck(TypedDict): """A queued check including checks that will be skipped""" - check_fcn: Callable[..., Union[FlagEntry,FlagEntryWithOutliers]] + check_fcn: Callable[..., Union[FlagEntry, FlagEntryWithOutliers]] """ A callable function that returns a flag entry or a string placeholder""" function: str @@ -486,7 +498,7 @@ def add( config: dict = None, description: str = None, full_description: str = None, - automated: bool = True + automated: bool = True, ): """Adds the check to the queue for each payload. Payload can be either supplied directly on the add invocation @@ -502,7 +514,7 @@ def add( description (str, optional): A description of the check function. Defaults to function name. Should be used if the function name doesn't adequately describe what is being checked. full_description (str, optional): A long, potentially multiline description of the check function. Defaults to function name. - NOT included in flag table but used to + NOT included in flag table but used to """ # override payload with one supplied directly to run if payloads: @@ -529,7 +541,7 @@ def add( # don't run if either this add call specifies skip # or if the component is being skipped "to_run": not any([skip, self.cur_component.skip]), - "automated": automated + "automated": automated, } ) return self @@ -540,7 +552,7 @@ def add_manual( start_instructions: str, skip: bool = False, pass_fail_questions: list[str] = list(), - pass_flag_questions: list[str] = list() + pass_flag_questions: list[str] = list(), ): """Adds the check to the queue for each payload. Payload can be either supplied directly on the add invocation @@ -556,38 +568,59 @@ def add_manual( description (str, optional): A description of the check function. Defaults to function name. Should be used if the function name doesn't adequately describe what is being checked. full_description (str, optional): A long, potentially multiline description of the check function. 
Defaults to function name. - NOT included in flag table but used to + NOT included in flag table but used to """ # Generate markdown style full description based on questions - pass_or_fail_block = '\n'.join([f" - {q}" for q in pass_fail_questions]) - pass_or_flag_block = '\n'.join([f" - {q}" for q in pass_flag_questions]) - pass_or_fail_section = "" if not pass_fail_questions else f"- Pass or Fail Questions:\n{pass_or_fail_block}" - pass_or_flag_section = "" if not pass_flag_questions else f"- Pass or Flag Questions:\n{pass_or_flag_block}" - full_description = textwrap.dedent(f""" + pass_or_fail_block = "\n".join( + [f" - {q}" for q in pass_fail_questions] + ) + pass_or_flag_block = "\n".join( + [f" - {q}" for q in pass_flag_questions] + ) + pass_or_fail_section = ( + "" + if not pass_fail_questions + else f"- Pass or Fail Questions:\n{pass_or_fail_block}" + ) + pass_or_flag_section = ( + "" + if not pass_flag_questions + else f"- Pass or Flag Questions:\n{pass_or_flag_block}" + ) + full_description = textwrap.dedent( + f""" - Manual Check: {description} {pass_or_fail_section} {pass_or_flag_section} - """) + """ + ) # Sanitize questions to ensure json serializable # Remove any single and double quotes from body and replace with tick marks - pass_fail_questions = [q.replace("'","`").replace('"',"`") for q in pass_fail_questions] - pass_flag_questions = [q.replace("'","`").replace('"',"`") for q in pass_flag_questions] - + pass_fail_questions = [ + q.replace("'", "`").replace('"', "`") for q in pass_fail_questions + ] + pass_flag_questions = [ + q.replace("'", "`").replace('"', "`") for q in pass_flag_questions + ] self._manual_check_queue.append( { - "check_fcn": "MANUAL_CHECK", # type: ignore + "check_fcn": "MANUAL_CHECK", # type: ignore "function": "MANUAL_CHECK", "description": description, "full_description": full_description, - "payload": {"start_instruction":start_instructions,"pass_or_fail_questions":pass_fail_questions, 
"pass_or_flag_questions":pass_flag_questions}, + "payload": { + "start_instruction": start_instructions, + "pass_or_fail_questions": pass_fail_questions, + "pass_or_flag_questions": pass_flag_questions, + }, "config": {}, "component": self.cur_component, # don't run if either this add call specifies skip # or if the component is being skipped "to_run": not any([skip, self.cur_component.skip]), - "automated": False + "automated": False, } ) return self @@ -607,7 +640,7 @@ def queued_checks( CHECK_PREFIX: str = " > ", INDENT_CHECKS_STR: str = " ", include_checks_counters: bool = True, - WRAP_COMPONENT_NAME_CHAR: str = "'" + WRAP_COMPONENT_NAME_CHAR: str = "'", ) -> str: """Returns a print-friendly string describing the queued checks. @@ -621,7 +654,7 @@ def queued_checks( Returns: str: A human friendly description of the queued checks. """ - description_field: Literal['full_description'] | Literal['description'] + description_field: Literal["full_description"] | Literal["description"] if long_description: description_field = "full_description" else: @@ -656,7 +689,9 @@ def render_self_and_children(component: ValidationProtocol._Component) -> str: count_str2 = f"[{len(check_by_component[component])}" lead_str = f"{INDENT_STR}{COMPONENT_PREFIX}{WRAP_COMPONENT_NAME_CHAR}{component.name}{WRAP_COMPONENT_NAME_CHAR}{'-> !SKIPPED!' 
if component.skip else ''}" if include_checks_counters: - buffer = f"{lead_str : <55}DIRECT:{count_str2 : >4}] ALL:{count_str : >5}]" + buffer = ( + f"{lead_str : <55}DIRECT:{count_str2 : >4}] ALL:{count_str : >5}]" + ) else: buffer = lead_str @@ -669,7 +704,8 @@ def render_self_and_children(component: ValidationProtocol._Component) -> str: ] ) check_line_print = [ - f"{line} x {line_count}" if line_count > 1 else line for line, line_count in check_lines.items() + f"{line} x {line_count}" if line_count > 1 else line + for line, line_count in check_lines.items() ] if check_lines: buffer += "\n" + "\n".join(check_line_print) @@ -696,7 +732,7 @@ def run(self, flag_unhandled_exceptions: bool = False): all_queued = self._check_queue + self._manual_check_queue for queued in all_queued: fcn = queued["check_fcn"] - if queued['automated']: + if queued["automated"]: fcn_name = fcn.__name__ else: fcn_name = "MANUAL_CHECK" @@ -746,7 +782,7 @@ def run(self, flag_unhandled_exceptions: bool = False): # peel off outlier data and keep track # using current component name as the top level key # Type ignored since FlagEntry dicts will return None as desired for this conditional - if fcn_outliers := result.pop("outliers", None): # type: ignore + if fcn_outliers := result.pop("outliers", None): # type: ignore self.outliers[queued["component"].name] = ( self.outliers[queued["component"].name] | fcn_outliers ) @@ -776,8 +812,9 @@ def run(self, flag_unhandled_exceptions: bool = False): } else: raise RuntimeError( - f"Function failed: {fcn_name} part of {queued['component']}" + f"Function failed: {fcn_name} part of {queued['component']} with original error message: {e}" ) from e + # add result (including skip flag) to component queued["component"].flags.append(packed_result) @@ -841,9 +878,11 @@ def report( # Preprocesing flags before tabulating df_data: list[dict] = list() for flag_result in unpreprocessed_df_data: - # Preprocess all 'message' and 'description' fit on one table line to 
ensure they fit on - flag_result['message'] = flag_result['message'].replace("\n","::NEWLINE::") - flag_result['description'] = flag_result['description'].replace("\n","::NEWLINE::") + # Preprocess all 'message' and 'description' fit on one table line to ensure they fit on + flag_result["message"] = flag_result["message"].replace("\n", "::NEWLINE::") + flag_result["description"] = flag_result["description"].replace( + "\n", "::NEWLINE::" + ) df_data.append(flag_result) diff --git a/dp_tools/core/configuration.py b/dp_tools/core/configuration.py index 78178b3..3a49fd3 100644 --- a/dp_tools/core/configuration.py +++ b/dp_tools/core/configuration.py @@ -60,6 +60,8 @@ def load_config(config: Union[tuple[str, str], Path]) -> dict: log.info(f"Loading config (direct path): {config}") with config.open() as f: conf_full = yaml.safe_load(f) + case _: + raise ValueError(f"Invalid config type: {type(config)}. Expected either a tuple (legacy builtin configurations) or Path object.") log.debug(f"Final config loaded: {conf_full}") diff --git a/dp_tools/core/entity_model.py b/dp_tools/core/entity_model.py index 64d20d9..d4ed880 100644 --- a/dp_tools/core/entity_model.py +++ b/dp_tools/core/entity_model.py @@ -164,7 +164,9 @@ def _load_asset( try: [asset] = asset.parent.glob(asset.name) except ValueError as exc: - raise ValueError(f"Failed to locate data asset using glob pattern: '{asset.name}'") from exc + raise ValueError( + f"Failed to locate data asset using glob pattern: '{asset.name}'" + ) from exc if not putative: assert asset.exists(), f"Failed to load asset at path '{asset}'" self.loaded_assets_dicts.append( @@ -192,7 +194,9 @@ def _load_asset( "config": {}, } ) - return DataAsset(key=key, path=asset, config=config, owner=owner, putative=putative) + return DataAsset( + key=key, path=asset, config=config, owner=owner, putative=putative + ) @property def loaded_assets_report(self) -> pd.DataFrame: diff --git a/dp_tools/core/files/isa_archive.py 
b/dp_tools/core/files/isa_archive.py index 8e457c8..d2e3465 100644 --- a/dp_tools/core/files/isa_archive.py +++ b/dp_tools/core/files/isa_archive.py @@ -7,7 +7,7 @@ import pandas as pd -log = logging.getLogger(__name__) +from loguru import logger as log ISA_INVESTIGATION_HEADERS = { "ONTOLOGY SOURCE REFERENCE", @@ -43,19 +43,29 @@ def isa_investigation_subtables(isaArchive: Path) -> dict[str, pd.DataFrame]: [i_file] = ( f for f in fetch_isa_files(isaArchive) if f.name.startswith("i_") ) - with open(i_file, "r") as f: - for line in [l.rstrip() for l in f.readlines()]: - # search for header - if line in ISA_INVESTIGATION_HEADERS: - if key != None: - tables[key] = pd.DataFrame( - table_lines - ).T # each subtable is transposed in the i_file - table_lines = list() - key = line # set next table key - else: - tokens = line.split("\t") # tab separated - table_lines.append(tokens) + # Default to 'utf-8' + try: + log.trace("Decoding ISA with 'utf-8") + with open(i_file, "r", encoding = "utf-8") as f: + lines = f.readlines() + # Fallback to "ISO-8859-1" if 'utf-8' fails + except UnicodeDecodeError: + log.warning("Failed using 'utf-8'. 
Decoding ISA with 'ISO-8859-1'") + with open(i_file, "r", encoding = "ISO-8859-1") as f: + lines = f.readlines() + for line in lines: + line = line.rstrip() + # search for header + if line in ISA_INVESTIGATION_HEADERS: + if key != None: + tables[key] = pd.DataFrame( + table_lines + ).T # each subtable is transposed in the i_file + table_lines = list() + key = line # set next table key + else: + tokens = line.split("\t") # tab separated + table_lines.append(tokens) tables[key] = pd.DataFrame( table_lines ).T # each subtable is transposed in the i_file diff --git a/dp_tools/core/post_processing.py b/dp_tools/core/post_processing.py index 11a4604..7a08709 100644 --- a/dp_tools/core/post_processing.py +++ b/dp_tools/core/post_processing.py @@ -224,7 +224,7 @@ def generate_new_column_dicts( ) # now remap those processing sample names to their orignal names, - # required for joining to orignal assay table + # required for fing to orignal assay table processing_to_orignal_mapping = pd.read_csv( dataset.data_assets["runsheet"].path, index_col="Sample Name" )["Original Sample Name"].to_dict() @@ -264,7 +264,7 @@ def generate_new_column_dicts( # joining by comma for header, header_wise in new_cols.items(): for sample, sample_wise in header_wise.items(): - new_value = ", ".join(sorted(list(new_cols[header][sample]))) + new_value = ",".join(sorted(list(new_cols[header][sample]))) new_cols[header][sample] = new_value diff --git a/dp_tools/core/utilites/metrics_extractor.py b/dp_tools/core/utilites/metrics_extractor.py new file mode 100644 index 0000000..8922cd8 --- /dev/null +++ b/dp_tools/core/utilites/metrics_extractor.py @@ -0,0 +1,1084 @@ +# Enum for assay types +import ast +from enum import Enum +import json +import pathlib +from pathlib import Path +from dataclasses import dataclass +from typing import Literal +import numpy as np + +import yaml +import pandas as pd +from loguru import logger + +from dp_tools.scripts import convert +from dp_tools.core.utilites import 
multiqc_tools + + +class AssayType(Enum): + bulkRNASeq = 1 + bulkRNASeq_VV = 2 + scRNASeq = 3 + spatialRNASeq = 4 + + +@dataclass +class MultiQCTargetSection: + targets: list[Path] + section_name: str + modules: list[str] + jsonTarget: list[str] | Literal[False] + + +class MetricsExtractor: + # Class Attributes + metrics: pd.DataFrame = pd.DataFrame() + mqc_metrics: pd.DataFrame = pd.DataFrame() + samplewise_metrics: pd.DataFrame = pd.DataFrame() + + def __init__(self, targets: list[MultiQCTargetSection]): + self.targets = targets + + # Ensure all column names are tuples + @staticmethod + def ensure_tuple(col_name, desired_length=4): + if isinstance(col_name, tuple): + # Pad with None if the tuple is smaller than desired_length + return col_name + (None,) * (desired_length - len(col_name)) + else: + # Create a tuple of desired_length with the col_name as the first element + return (col_name,) + (None,) * (desired_length - 1) + + def extract_general_information(self, assay_type: AssayType, yaml_file: str): + """This function parses data from a yaml file that is applicable on a dataset level. + + Examples include: + OSD-# + GLDS-# + Sample Name + Organism + Tissue Type + Library prep method (e.g. 
ribo-deplete (aka: totRNA) or polyA-enriched (aka: mRNA)) + % rRNA contamination + PE or SE + Stranded or Unstranded + Library prep kit + Data source (GeneLab generated, User-submitted, or Federated) + """ + # Parse the yaml file + with open(yaml_file) as file: + data = yaml.load(file, Loader=yaml.FullLoader) + + EXPECTED_KEYS = [ + "OSD-#", + "GLDS-#", + "Sample Name", + "Organism", + "Tissue Type", + "Library prep method", + "% rRNA contamination", + "PE or SE", + "Stranded or Unstranded", + "Library prep kit", + "Data source", + ] + + # Validate all keys and report all missing + missing_keys: list[str] = list() + for key in EXPECTED_KEYS: + if key not in data: + missing_keys.append(key) + + if missing_keys: + raise ValueError(f"Missing keys: {missing_keys}") + + def extract_sections(self): + def _extract_section_from_json( + self, section_name: str, json_file: Path, module: str + ): + # Load json data + with open(json_file) as file: + data = json.load(file) + + # Note: Certain modules like RSeQC don't produce a general stats table + # So we need to check if it exists before trying to extract it + if data["report_general_stats_data"]: + flat_data = multiqc_tools.get_reformated_source_dict( + data["report_general_stats_data"][ + 0 + ] # assumes only one module per multiQC json file + ) + + df_general_stats = pd.DataFrame(flat_data).T + + # Add a section as the first part of the column MultiIndex + df_general_stats.columns = pd.MultiIndex.from_tuples( + [ + (section_name, f"multiqc_{module}", "general_stats", col) + for col in df_general_stats.columns + ] + ) + + # Add section_name as last part of row MultiIndex + df_general_stats.index = df_general_stats.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_general_stats.index = pd.MultiIndex.from_tuples( + list( + zip( + df_general_stats.index.get_level_values( + "sample name" + ).str.replace( + "-", "_" + ), # Plots data performs this conversion so we match it here + 
df_general_stats.index.get_level_values( + "sample subcomponent" + ), + [section_name] * len(df_general_stats.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + df_updated_metrics = df_general_stats + has_general_stats = True + else: + has_general_stats = False + + # Same for plot data + df_plot_data = multiqc_tools.format_plots_as_dataframe( + data["report_plot_data"] + ) + # Add section_name as last part of row MultiIndex + df_plot_data.index = df_plot_data.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_plot_data.index = pd.MultiIndex.from_tuples( + list( + zip( + df_plot_data.index.get_level_values("sample name"), + df_plot_data.index.get_level_values("sample subcomponent"), + [section_name] * len(df_plot_data.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + # Add a section as the first part of the column MultiIndex + df_plot_data.columns = pd.MultiIndex.from_tuples( + [(section_name, *col) for col in df_plot_data.columns] + ) + + if has_general_stats: + df_updated_metrics = df_updated_metrics.merge( + df_plot_data, left_index=True, right_index=True + ) + else: + df_updated_metrics = df_plot_data + + # Convert all columns to tuples + columns_as_tuples = df_updated_metrics.columns.map(self.ensure_tuple) + + # Create MultiIndex + df_updated_metrics.columns = pd.MultiIndex.from_tuples(columns_as_tuples) + + self.metrics = self.metrics.append(df_updated_metrics) + + def _extract_section( + self, section_name: str, files: list[Path], modules: list[str] + ): + mqc_ret = multiqc_tools.get_parsed_data( + input_f=[str(f) for f in files], modules=modules, as_dataframe=False + ) + flat_data = multiqc_tools.flatten_raw_data(mqc_ret["report"]) + + df_general_stats = pd.DataFrame(flat_data).T + + # Add section_name as last part of row MultiIndex + df_general_stats.index = df_general_stats.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_general_stats.index = 
pd.MultiIndex.from_tuples( + list( + zip( + df_general_stats.index.get_level_values("sample name"), + df_general_stats.index.get_level_values("sample subcomponent"), + [section_name] * len(df_general_stats.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + # Metrics names may include '-', whereas all multiQC names covert these to '_' + # So here we create a temporary column with the '-' replaced with '_' for merging purposes + # if isinstance(self.metrics.index, pd.MultiIndex): + # idx_sample_name = self.metrics.index.get_level_values('sample name') + # idx_sample_name = idx_sample_name.str.replace('-','_') + # self.metrics.index = pd.MultiIndex.from_arrays( + # [ + # idx_sample_name, + # self.metrics.index.get_level_values('sample subcomponent'), + # self.metrics.index.get_level_values('name'), + # ], + # names = ['sample name','sample subcomponent','name'] + # ) + # else: + # self.metrics.index = self.metrics.index.str.replace('-','_') + # # self.metrics.index = self.metrics.index.str.replace('-','_') + + df_updated_metrics = df_general_stats + + # Same for plot data + df_plot_data = multiqc_tools.format_plots_as_dataframe(mqc_ret) + # Add section_name as last part of row MultiIndex + df_plot_data.index = df_plot_data.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_plot_data.index = pd.MultiIndex.from_tuples( + list( + zip( + df_plot_data.index.get_level_values("sample name"), + df_plot_data.index.get_level_values("sample subcomponent"), + [section_name] * len(df_plot_data.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + df_updated_metrics = df_updated_metrics.merge( + df_plot_data, left_index=True, right_index=True + ) + + # Add a section as the first part of the column MultiIndex + df_updated_metrics.columns = pd.MultiIndex.from_tuples( + [(section_name, *col) for col in df_updated_metrics.columns] + ) + + self.metrics = self.metrics.append(df_updated_metrics) + + for target in 
self.targets: + if target.jsonTarget == False: + _extract_section( + self, target.section_name, target.targets, target.modules + ) + else: + _extract_section_from_json( + self, target.section_name, target.jsonTarget, target.modules[0] + ) # Restriction: Can only handle one module multiQC json + # Convert index of three part tuple to MultiIndex + # Unnamed so must access part position in tuple + # self.metrics.index = pd.MultiIndex.from_tuples(self.metrics.index, names = ['sample name','sample subcomponent','name']) + # self.metrics.index = self.metrics.index.set_names(['sample name','sample subcomponent','name']) + + # Merge in samplewise metrics + metrics_reset = self.metrics.reset_index(level=["sample subcomponent", "name"]) + + samplewise_metrics_cleaned = self.samplewise_metrics.copy() + samplewise_metrics_cleaned.index = samplewise_metrics_cleaned.index.str.replace( + "-", "_" + ) + + merged = metrics_reset.merge( + samplewise_metrics_cleaned, + how="left", + left_on="sample name", + right_index=True, + ) + # Rename based on length of coerced tuples + merged = merged.rename( + columns={ + ("sample subcomponent", "", "", ""): "sample subcomponent", + ("name", "", "", ""): "name", + } + ) + merged = merged.set_index(["sample subcomponent", "name"], append=True) + + self.metrics = merged + + def extract_data_from_isa( + self, accession: str, isa_archive: pathlib.Path, config: tuple[str, str] + ): + class mock_schema: + @staticmethod + def validate(df): + pass + + samplewise_metrics = convert.isa_to_runsheet( + accession, + isa_archive, + config, + schema=mock_schema(), # type: ignore + assert_factor_values=False, + ) + + self.samplewise_metrics = samplewise_metrics + + def append_manual_yaml_data(self, target_yaml: Path): + # Start with df_isa and add columns for each key value in yaml + with open(target_yaml) as file: + new_data = yaml.safe_load(file) + + # Add the new data to the existing data as new columns + for key, value in new_data.items(): + 
self.samplewise_metrics[key] = value + + def process_metrics(self, assay_type: AssayType): + match assay_type: + case AssayType.bulkRNASeq: + df_samplewise = pd.DataFrame() + + # Copy here is inefficient but useful to keep original dataframe unmodified + df_interim = self.metrics.copy() + + # Ensure all column names are tuples + def ensure_tuple(col_name, desired_length=4): + if isinstance(col_name, tuple): + # Pad with None if the tuple is smaller than desired_length + return col_name + (None,) * (desired_length - len(col_name)) + else: + # Create a tuple of desired_length with the col_name as the first element + return (col_name,) + (None,) * (desired_length - 1) + + # Convert all columns to tuples + columns_as_tuples = df_interim.columns.map(ensure_tuple) + + # Create MultiIndex + df_interim.columns = pd.MultiIndex.from_tuples(columns_as_tuples) + + def _process_fastqc_data(df_full: pd.DataFrame, section_name: str): + df_samplewise = pd.DataFrame() + # Raw reads + df_fastqc_subset = df_full.xs( + key=(section_name, "_R1"), + axis="rows", + level=["name", "sample subcomponent"], + ) + + # M Seqs (read depth) + df_samplewise.index = df_fastqc_subset.index + try: + df_samplewise["Total Seqs"] = df_fastqc_subset[ + ( + section_name, + "multiqc_fastqc", + "general_stats", + "Total Sequences", + ) + ].astype(int) + except ( + KeyError + ): # Sometimes the key is named differently (e.g. 
OSD-511) + df_samplewise["Total Seqs"] = df_fastqc_subset[ + ( + section_name, + "multiqc_fastqc", + "general_stats", + "total_sequences", + ) + ].astype(int) + + # Read length (This can be a range, especially for trimmed reads) + df_samplewise["Mean Read Length"] = df_fastqc_subset[ + ( + section_name, + "multiqc_fastqc", + "general_stats", + "avg_sequence_length", + ) + ] + + # # Check column type and perform action + # def _process_read_length_column(column): + # if np.issubdtype(column.dtype, np.number): + # return column.astype(int), column.astype(int) + # elif column.dtype == object: + # min_length = column.str.split("-").apply( + # lambda read_length_list: read_length_list[0] + # ) + # max_length = column.str.split("-").apply( + # lambda read_length_list: read_length_list[1] + # ) + # return min_length.astype(int), max_length.astype(int) + # raise ValueError( + # "Column type not recognized. Expected int or string like object (i.e. '20-151')" + # ) + + # ( + # df_samplewise["Min Read Length"], + # df_samplewise["Max Read Length"], + # ) = _process_read_length_column(read_length_series) + + # Mean & Median Q Score (Across all bases) + df_samplewise = df_samplewise.merge( + ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs( + key=(section_name, "FastQC: Mean Quality Scores"), + axis="columns", + level=[0, 1], + ) + .agg(["mean", "median"], axis="columns") + ).rename( + columns={ + "mean": "Average Q Score (Across all plotted base positions)", + "median": "Median Q Score (Across all plotted base positions)", + } + ), + left_index=True, + right_index=True, + ) + + # % Dups + try: + df_samplewise = 100 - df_samplewise.merge( + ( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + ), + axis="columns", + ).xs(key=(section_name), axis="rows", level="name")[ + "total_deduplicated_percentage" + ] + ).rename("% Dups"), + left_index=True, + right_index=True, + ) + except: # Sometimes key names are different + 
df_samplewise = df_samplewise.merge( + ( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + ), + axis="columns", + ).xs(key=(section_name), axis="rows", level="name")[ + "percent_duplicates" + ] + ).rename("% Dups"), + left_index=True, + right_index=True, + ) + + # Mean %GC + try: + df_samplewise = df_samplewise.merge( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + "%GC", + ), + axis="columns", + ) + .xs(key=(section_name), axis="rows", level="name") + .rename("Mean %GC"), + left_index=True, + right_index=True, + ) + except KeyError: # %GC <-> percent_gc + df_samplewise = df_samplewise.merge( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + "percent_gc", + ), + axis="columns", + ) + .xs(key=(section_name), axis="rows", level="name") + .rename("Mean %GC"), + left_index=True, + right_index=True, + ) + + def _first_col_reaching_min(row, min_value): + # Use boolean indexing to get columns that are >= min_value + valid = row[row >= min_value] + + # Return the first column name that satisfies the condition, or None if not found + return valid.index[0] if not valid.empty else None + + def _last_col_reaching_min(row, min_value): + # Use boolean indexing to get columns that are >= min_value + valid = row[row >= min_value] + + # Return the last column name that satisfies the condition, or None if not found + return valid.index[-1] if not valid.empty else None + + def _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum( + row, proportion + ): + # Find all columns that exceed the proportion of the row sum + valid = row[row.cumsum() >= proportion * row.sum()] + + # Return the first column name that satisfies the condition, or None if not found + return valid.index[0] if not valid.empty else None + + df_gc_plots = df_full.xs( + key=section_name, + axis="rows", + level="name", + ).xs( + key=( + section_name, + "FastQC: Per Sequence GC Content", + "FastQC: Per Sequence GC 
Content", + ), + axis="columns", + level=[0, 1, 2], + ) + + # Min %GC Reaching %1 Counts + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _first_col_reaching_min, axis="columns", min_value=1 + ).apply(lambda s: s.split()[0]) + ).rename("Min %GC reaching 1% Counts"), + left_index=True, + right_index=True, + ) + + # Max %GC Reaching %1 Counts + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _last_col_reaching_min, axis="columns", min_value=1 + ).apply(lambda s: s.split()[0]) + ).rename("Max %GC reaching 1% Counts"), + left_index=True, + right_index=True, + ) + + # 25% Quartile AUC Point %GC + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum, + axis="columns", + proportion=0.25, + ).apply(lambda s: s.split()[0]) + ).rename("25% Quartile AUC Point %GC"), + left_index=True, + right_index=True, + ) + # 50% Quartile AUC Point %GC + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum, + axis="columns", + proportion=0.50, + ).apply(lambda s: s.split()[0]) + ).rename("50% Quartile AUC Point %GC"), + left_index=True, + right_index=True, + ) + # 75% Quartile AUC Point %GC + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum, + axis="columns", + proportion=0.75, + ).apply(lambda s: s.split()[0]) + ).rename("75% Quartile AUC Point %GC"), + left_index=True, + right_index=True, + ) + + # % N Content + df_n_content_plots = df_full.xs( + key=section_name, + axis="rows", + level="name", + ).xs( + key=( + section_name, + "FastQC: Per Base N Content", + "FastQC: Per Base N Content", + ), + axis="columns", + level=[0, 1, 2], + ) + + # % N Content Summed Across All Plotted Bases Positions + df_samplewise = df_samplewise.merge( + df_n_content_plots.sum(axis="columns").rename( + "% N Content Summed Across All 
Plotted Bases Positions" + ), + left_index=True, + right_index=True, + ) + + return df_samplewise + + def _process_align_data(df_full: pd.DataFrame, section_name: str): + df_samplewise = pd.DataFrame() + + df_align_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + df_samplewise.index = df_align_subset.index + df_samplewise["% Uniquely mapped"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "uniquely_mapped_percent", + ) + ].astype(float) + + df_samplewise["% Mapped to multiple loci"] = df_align_subset[ + ("multiqc_star", "general_stats", "multimapped_percent") + ].astype(float) + + df_samplewise["% Mapped to too many loci"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "multimapped_toomany_percent", + ) + ].astype(float) + + df_samplewise["% Unmapped too short"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "unmapped_tooshort_percent", + ) + ].astype(float) + + df_samplewise["% Unmapped other"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "unmapped_other_percent", + ) + ].astype(float) + + return df_samplewise + + def _process_rseqc_genebody_coverage_data( + df_full: pd.DataFrame, section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + df_samplewise.index = df_rseqc_subset.index + + # Average % Coverage from 5-20 percentile (5' end coverage) + def _get_mean_for_percentile_range(min_range: int, max_range: int): + level_0_value = "RSeQC: Gene Body Coverage" + level_1_value = "RSeQC: Gene Body Coverage" + level_2_values_to_select = [ + f"{i} Gene Body Percentile (5' -> 3') (% Coverage)" + for i in range(min_range, max_range + 1) + ] # list of desired values for third level + + 
mask = ( + ( + df_rseqc_subset.columns.get_level_values(0) + == level_0_value + ) + & ( + df_rseqc_subset.columns.get_level_values(1) + == level_1_value + ) + & ( + df_rseqc_subset.columns.get_level_values(2).isin( + level_2_values_to_select + ) + ) + ) + + return ( + df_rseqc_subset.loc[:, mask] + .astype(float) + .mean(axis="columns") + ) + + @dataclass + class TARGET_RANGE: + lower_bound: int + upper_bound: int + label: str + + TARGET_RANGES: list[TARGET_RANGE] = [ + TARGET_RANGE(5, 20, "5' end coverage"), + TARGET_RANGE(40, 60, "middle coverage"), + TARGET_RANGE(80, 95, "3' end coverage"), + ] + + for target in TARGET_RANGES: + df_samplewise[ + f"Average % Coverage from {target.lower_bound}-{target.upper_bound} percentile ({target.label})" + ] = _get_mean_for_percentile_range( + target.lower_bound, target.upper_bound + ) + + df_samplewise["Ratio of 3' end coverage to 5' end coverage"] = ( + df_samplewise[ + f"Average % Coverage from 80-95 percentile (3' end coverage)" + ] + / df_samplewise[ + f"Average % Coverage from 5-20 percentile (5' end coverage)" + ] + ) + + return df_samplewise + + def _process_rseqc_infer_experiment_data( + df_full: pd.DataFrame, section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + df_samplewise["% Sense"] = df_rseqc_subset[ + ( + "RSeQC: Infer experiment", + "RSeQC: Infer experiment", + "Sense (% Tags)", + ) + ].astype(float) + + df_samplewise["% Antisense"] = df_rseqc_subset[ + ( + "RSeQC: Infer experiment", + "RSeQC: Infer experiment", + "Antisense (% Tags)", + ) + ].astype(float) + + df_samplewise["% Undetermined"] = df_rseqc_subset[ + ( + "RSeQC: Infer experiment", + "RSeQC: Infer experiment", + "Undetermined (% Tags)", + ) + ].astype(float) + + return df_samplewise + + def _process_rseqc_inner_distance_data( + df_full: pd.DataFrame, 
section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + # Inner Distance Peak Distance + # Extract from column tuple + # Example: ('RSeQC: Inner Distance', 'RSeQC: Inner Distance', '-117.5 Inner Distance (bp) (Counts)') + # Yields: -117.5 + try: + df_samplewise["Peak Inner Distance"] = ( + df_rseqc_subset.idxmax(axis="columns") + .apply(lambda col: col[2]) + .astype(float) + ) + except ( + ValueError + ): # e.g. ValueError: could not convert string to float: '-142.5 Inner Distance (bp) (Counts)' + df_samplewise["Peak Inner Distance"] = ( + df_rseqc_subset.idxmax(axis="columns") + .apply(lambda col: col[2].split()[0]) + .astype(float) + ) + + # % Reads At Inner Distance Peak Distance + df_samplewise["% Reads At Peak Inner Distance"] = ( + df_rseqc_subset.max(axis="columns") + / df_rseqc_subset.sum(axis="columns") + * 100 + ) + + # TAGUP: Inner distance at 1% + + return df_samplewise + + def _process_rseqc_read_distribution_data( + df_full: pd.DataFrame, section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + @dataclass + class TARGET_LABELS: + dataframe_name: str + metrics_name: str + + TARGETS: list[TARGET_LABELS] = [ + TARGET_LABELS("CDS_Exons (# Tags)", "% CDS_Exons"), + TARGET_LABELS("5'UTR_Exons (# Tags)", "% 5'UTR_Exons"), + TARGET_LABELS("3'UTR_Exons (# Tags)", "% 3'UTR_Exons"), + TARGET_LABELS("Introns (# Tags)", "% Introns"), + TARGET_LABELS("TSS_up_1kb (# Tags)", "% TSS_up_1kb"), + TARGET_LABELS("TSS_up_1kb-5kb (# Tags)", "% TSS_up_5kb"), + TARGET_LABELS("TSS_up_5kb-10kb (# Tags)", "% TSS_up_10kb"), + TARGET_LABELS("TES_down_1kb (# Tags)", "% TES_down_1kb"), + 
TARGET_LABELS("TES_down_1kb-50kb (# Tags)", "% TES_down_5kb"), + TARGET_LABELS("TES_down_5kb-10kb", "% TES_down_10kb"), + TARGET_LABELS( + "Other_intergenic (# Tags)", "% Other_intergenic" + ), + ] + + for target in TARGETS: + try: + # Plot data + df_samplewise[target.metrics_name] = df_rseqc_subset[ + ( + "RSeQC: Read Distribution", + "RSeQC: Read Distribution", + target.dataframe_name, + ) + ] + except: + # No plot data means zero for the given tag + df_samplewise[target.metrics_name] = 0 + + # Convert all to percents by summing across row and dividing each by sum + df_samplewise = df_samplewise.apply( + lambda col: col / df_samplewise.sum(axis="columns") * 100 + ) + + return df_samplewise + + df_samplewise_raw = _process_fastqc_data(df_interim, "raw reads") + df_samplewise_trimmed = _process_fastqc_data( + df_interim, "trimmed reads" + ) + + df_samplewise_align = _process_align_data(df_interim, "aligned reads") + + #df_samplewise_rseqc_genebody_coverage = ( + # _process_rseqc_genebody_coverage_data( + # df_interim, "rseqc: genebody coverage" + # ) + #) + + #df_samplewise_rseqc_infer_experiment = ( + # _process_rseqc_infer_experiment_data( + # df_interim, "rseqc: infer experiment" + # ) + #) + + #df_samplewise_rseqc_inner_distance = _process_rseqc_inner_distance_data( + # df_interim, "rseqc: inner distance" + #) + + #df_samplewise_rseqc_read_distribution = ( + # _process_rseqc_read_distribution_data( + # df_interim, "rseqc: read distribution" + # ) + #) + + # Merge all + df_merged = ( + df_samplewise_raw.merge( + df_samplewise_trimmed, + left_index=True, + right_index=True, + suffixes=(" Raw", " Trimmed"), + ) + .merge( + df_samplewise_align, + left_index=True, + right_index=True, + ) + .sort_index() + ) + + return df_merged + + case AssayType.bulkRNASeq_VV: + df_samplewise = pd.DataFrame() + + # Copy here is inefficient but useful to keep original dataframe unmodified + df_interim = self.metrics.copy() + + # Convert all columns to tuples + columns_as_tuples = 
df_interim.columns.map(self.ensure_tuple) + + # Create MultiIndex + df_interim.columns = pd.MultiIndex.from_tuples(columns_as_tuples) + + # Raw reads + raw_reads = df_interim.xs( + key=("raw reads", "_R1"), + axis="rows", + level=["name", "sample subcomponent"], + ) + + # Read Depth Range + df_samplewise.index = raw_reads.index + df_samplewise["Total Seqs"] = raw_reads[ + ("raw reads", "multiqc_fastqc", "general_stats", "Total Sequences") + ].astype(int) + + # Read length + df_samplewise["Read Length"] = raw_reads[ + ("raw reads", "multiqc_fastqc", "general_stats", "Sequence length") + ].astype(int) + + # Mean & Median Q Score (Across all bases) + df_samplewise = df_samplewise.merge( + ( + df_interim.xs( + key="raw reads", + axis="rows", + level="name", + ) + .xs( + key=("raw reads", "FastQC: Mean Quality Scores"), + axis="columns", + level=[0, 1], + ) + .agg(["mean", "median"], axis="columns") + ).rename( + columns={ + "mean": "Average Q Score (Across all plotted base positions)", + "median": "Median Q Score (Across all plotted base positions)", + } + ), + left_index=True, + right_index=True, + ) + + # % Dups + + print("DONE") + case _: + raise NotImplementedError( + f"Assay type {assay_type} not implemented for summarization." + ) + + def load_metrics_csv(self, metrics_csv: Path): + # check\ metrics hasn't been created yet or loaded + assert self.metrics.equals( + pd.DataFrame() + ), "Metrics already loaded. Please create a new MetricsExtractor object." 
+ + self.metrics = pd.read_csv(metrics_csv, index_col=[0, 1, 2]) + + # Set index names + self.metrics.index = self.metrics.index.set_names( + ["sample name", "sample subcomponent", "name"] + ) + + # Convert column names to tuples if they represent valid tuples + def _convert_to_tuple_if_valid(col_name): + try: + # Check if the column name can be evaluated to a tuple + result = ast.literal_eval(col_name) + if isinstance(result, tuple): + return result + except (SyntaxError, ValueError): + pass + return col_name + + self.metrics.columns = [ + _convert_to_tuple_if_valid(col) for col in self.metrics.columns + ] + + +def generate_extractor_from_yaml_config(config: Path) -> MetricsExtractor: + with open(config) as file: + config_data = yaml.safe_load(file) + + targets: list[MultiQCTargetSection] = list() + + for section in config_data["Extraction Settings"]["sections"]: + if not section["enabled"]: + logger.info(f"Skipping {section['name']} because it is disabled.") + continue + + # Set up MultiQC targets + search_dir = Path( + config_data["Extraction Settings"]["root search directory"] + ) / Path(*section["multiQC"]["logs directory"]) + + if section["multiQC"].get("from json", False): + jsonTarget = Path( + config_data["Extraction Settings"]["root search directory"] + ) / Path(*section["multiQC"]["from json"]) + else: + jsonTarget = False + + found_files: list[Path] = list() + for logs_pattern in section["multiQC"]["logs pattern(s)"]: + if section["multiQC"]["search recursively"]: + found_files.extend(list(search_dir.rglob(logs_pattern))) + else: + found_files.extend(list(search_dir.glob(logs_pattern))) + + # Catch empty lists + if len(found_files) == 0 and not jsonTarget: + raise ValueError( + f"No files found for {section['name']}. Configuration may be broken or consider disabling section if data is not present." 
+ ) + + targets.append( + MultiQCTargetSection( + targets=found_files, + section_name=section["name"], + modules=section["multiQC"]["modules"], + jsonTarget=jsonTarget, + ) + ) + + return MetricsExtractor(targets=targets) diff --git a/dp_tools/core/utilites/multiqc_tools.py b/dp_tools/core/utilites/multiqc_tools.py index 7d053f5..09cf847 100644 --- a/dp_tools/core/utilites/multiqc_tools.py +++ b/dp_tools/core/utilites/multiqc_tools.py @@ -5,6 +5,7 @@ from typing import List, TypedDict import logging + log = logging.getLogger(__name__) import multiqc @@ -12,15 +13,13 @@ # iterable to remove suffixes and add them as subsource descriptors SUBSOURCES = [ - "_R1_raw", - "_R2_raw", "_R1", "_R2", "__STARpass1", ] # iterable to remove suffixes that does NOT add them as subsource descriptors (often due to the name being redundantly associated with columns) -SCRUB_SAMPLES = ["_read_dist", "_infer_expt"] +SCRUB_SAMPLES = ["_read_dist", "_infer_expt", "_raw"] def clean_messy_sample(messy_sample: str): @@ -93,12 +92,12 @@ def get_parsed_data( log.info(f"Using MQC to parse: {input_f}") try: # a workaround for flushing handlers in MQC version 1.11 - logger = log.getLogger("multiqc") - [logger.removeHandler(h) for h in logger.handlers] + # logger = log.getLogger("multiqc") + # [logger.removeHandler(h) for h in logger.handlers] mqc_ret = multiqc.run( - input_f, - no_data_dir=True, - module=modules, + input_f, + no_data_dir=True, + module=modules, quiet=True, no_ansi=True, ) # note: empty list for modules falls back on all modules @@ -170,15 +169,20 @@ class MQCRunDict(TypedDict): def get_general_stats(mqc_run_output: MQCRunDict) -> dict[str, dict]: returnDict = dict() report = mqc_run_output["report"] - mqc_modules = [list(header_entry.values())[0]['namespace'] for header_entry in report.general_stats_headers] + mqc_modules = [ + list(header_entry.values())[0]["namespace"] + for header_entry in report.general_stats_headers + ] for mqc_module, single_module_data in zip(mqc_modules, 
report.general_stats_data): returnDict[mqc_module] = single_module_data return returnDict -def format_plots_as_dataframe(mqc_rep: MQCRunDict) -> pd.DataFrame: +def format_plots_as_dataframe(mqc_rep: MQCRunDict | dict) -> pd.DataFrame: log.info(f"Formatting to dataframe") - mqc_rep = mqc_rep["report"] + if getattr(mqc_rep, "report", False): + mqc_rep = mqc_rep["report"] + # ingest plot data flat_plot_dict = format_plot_data(mqc_rep) # reformat to flatten list of dicts into single dict @@ -278,9 +282,11 @@ def __parse_xy_line_graph_to_flat_dict(plot_data): def format_plot_data(mqc_rep: dict): - log.info(f"Attempting to extract data from {len(mqc_rep.plot_data)} plots") + if mqc_rep.get("report"): + mqc_rep = mqc_rep.get("report").plot_data + log.info(f"Attempting to extract data from {len(mqc_rep)} plots") all_clean_data = dict() - for plot_key, plot_data in mqc_rep.plot_data.items(): + for plot_key, plot_data in mqc_rep.items(): log.info( f"Attempting to extract data from plot with Title: {plot_data['config']['title']}" ) diff --git a/dp_tools/glds_api/commons.py b/dp_tools/glds_api/commons.py index e3d8f12..d13655d 100644 --- a/dp_tools/glds_api/commons.py +++ b/dp_tools/glds_api/commons.py @@ -11,10 +11,10 @@ log = logging.getLogger(__name__) -GENELAB_DATASET_FILES = "https://genelab-data.ndc.nasa.gov/genelab/data/glds/files/{accession_number}" +GENELAB_DATASET_FILES = "https://osdr.nasa.gov/genelab/data/glds/files/{accession_number}" """ Template URL to access json of files for a single GLDS accession ID """ -FILE_RETRIEVAL_URL_PREFIX = "https://genelab-data.ndc.nasa.gov{suffix}" +FILE_RETRIEVAL_URL_PREFIX = "https://osdr.nasa.gov{suffix}" """ Used to retrieve files using remote url suffixes listed in the 'Data Query' API """ @functools.cache @@ -37,6 +37,7 @@ def get_table_of_files(accession: str) -> pd.DataFrame: # fetch data log.info(f"URL Source: {url}") + print(url) with urlopen(url) as response: data = yaml.safe_load(response.read()) df = 
pd.DataFrame(data['studies'][accession_osd]['study_files']) diff --git a/dp_tools/scripts/convert.py b/dp_tools/scripts/convert.py index 6c0e744..99010f4 100644 --- a/dp_tools/scripts/convert.py +++ b/dp_tools/scripts/convert.py @@ -6,6 +6,7 @@ from dp_tools.config import schemas from dp_tools.core.configuration import load_config from dp_tools.core.files import isa_archive +from dp_tools.core.files.isa_archive import isa_investigation_subtables from dp_tools.glds_api.commons import retrieve_file_url from dp_tools import plugin_api @@ -23,66 +24,6 @@ class BulkRNASeqMetadataComponent: pass -# TODO: refactor this with the analogous metadata component method -def isa_investigation_subtables(ISAarchive: Path) -> dict[str, pd.DataFrame]: - tables: dict[str, pd.DataFrame] = dict() - - # track sub table lines - table_lines: List[list] = list() - key: str = None # type: ignore - - try: - [i_file] = ( - f - for f in isa_archive.fetch_isa_files(ISAarchive) - if f.name.startswith("i_") - ) - except ValueError: - raise FileNotFoundError( - f"Could not find an i_* file inside: {ISAarchive.name}, is this an ISA archive?" 
- ) - with open(i_file, "r") as f: - for line in [l.rstrip() for l in f.readlines()]: - # search for header - if line in isa_archive.ISA_INVESTIGATION_HEADERS: - if key != None: - tables[key] = pd.DataFrame( - table_lines - ).T # each subtable is transposed in the i_file - table_lines = list() - key = line # set next table key - else: - tokens = line.split("\t") # tab separated - table_lines.append(tokens) - tables[key] = pd.DataFrame( - table_lines - ).T # each subtable is transposed in the i_file - - # reformat each table - def clean_quotes(string: str) -> str: - SINGLE_OR_DOUBLE_QUOTES = "\"'" - # don't perform on non-string elements - if not isinstance(string, str): - return string - else: - return string.lstrip(SINGLE_OR_DOUBLE_QUOTES).rstrip( - SINGLE_OR_DOUBLE_QUOTES - ) - - df: pd.DataFrame - for key, df in tables.items(): - - # note: as a ref, no reassign needed - tables[key] = ( - df.rename(columns=df.iloc[0]).drop(df.index[0]).applymap(clean_quotes) - ) - - # ensure all expected subtables present - assert set(tables.keys()) == isa_archive.ISA_INVESTIGATION_HEADERS - - return tables - - def get_assay_table_path( ISAarchive: Path, configuration: dict, return_index: bool = False ) -> Path: @@ -236,7 +177,7 @@ def get_column_name(df: pd.DataFrame, target: Union[str, list]) -> str: # TODO: Needs heavy refactoring and log messaging -def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, str], Path], inject: dict[str, str] = {}, schema: Union[DataFrameSchema, None] = None): +def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, str], Path], inject: dict[str, str] = {}, schema: Union[DataFrameSchema, None] = None, assert_factor_values: bool = True): ################################################################ ################################################################ # SETUP CONFIG AND INPUT TABLES @@ -311,6 +252,7 @@ def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, s ] for 
entry in assay_source_entries: assert list(df_final.index) == list(df_merged.index) + use_fallback_value = False if entry.get("Runsheet Index"): # already set and checked above continue @@ -403,8 +345,17 @@ def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, s index=df_merged.index, ) else: - target_col = get_column_name(df_merged, entry["ISA Field Name"]) - series_to_add = df_merged[target_col] + try: + target_col = get_column_name(df_merged, entry["ISA Field Name"]) + series_to_add = df_merged[target_col] + except ValueError as e: # Raised when a column is not present + if entry.get("Fallback Value"): + # Create series of same row length as df_merged + series_to_add = pd.Series([entry.get("Fallback Value") for _ in range(len(df_merged))]) + use_fallback_value = True + log.warn(f"Could not find column: {entry['ISA Field Name']}. Using configured fallback value: {entry.get('Fallback Value')}") + else: + raise(e) if entry.get("GLDS URL Mapping"): def map_url_to_filename(fn: str) -> str: try: @@ -418,7 +369,9 @@ def map_url_to_filename(fn: str) -> str: map_url_to_filename ) # inplace operation doesn't seem to work series_to_add = _swap - if entry.get("Remapping"): + if use_fallback_value: + df_final[entry["Runsheet Column Name"]] = entry["Fallback Value"] + elif entry.get("Remapping"): df_final[entry["Runsheet Column Name"]] = series_to_add.map( lambda val: entry.get("Remapping")[val] ) @@ -463,10 +416,11 @@ def map_url_to_filename(fn: str) -> str: runsheet_schema.validate(df_final) - # ensure at least on Factor Value is extracted - assert ( - len([col for col in df_final.columns if col.startswith("Factor Value[")]) != 0 - ), f"Must extract at least one factor value column but only has the following columns: {df_final.columns}" + if assert_factor_values: + # ensure at least on Factor Value is extracted + assert ( + len([col for col in df_final.columns if col.startswith("Factor Value[")]) != 0 + ), f"Must extract at least one factor value 
column but only has the following columns: {df_final.columns}" ################################################################ ################################################################ diff --git a/dp_tools/scripts/vv_interface.py b/dp_tools/scripts/vv_interface.py index e6c2a19..94ac7dd 100644 --- a/dp_tools/scripts/vv_interface.py +++ b/dp_tools/scripts/vv_interface.py @@ -9,6 +9,7 @@ from dp_tools.core.loaders import load_data from dp_tools.core.check_model import ValidationProtocol, FlagCode, run_manual_check + @click.group() def cli(): pass @@ -18,30 +19,73 @@ def cli(): def validation(): pass + cli.add_command(validation) + @click.command() -@click.option('--output', default="VV_report.tsv", help="Name of report output file", show_default=True) -@click.argument('plug_in_dir') -@click.argument('data_dir') -@click.argument('runsheet_path') -def run(plug_in_dir, output, data_dir, runsheet_path): +@click.option( + "--output", + default="VV_report.tsv", + help="Name of report output file", + show_default=True, +) +@click.argument("plug_in_dir") +@click.argument("data_dir") +@click.argument("runsheet_path") +@click.option( + "--data-asset-key-sets", + type=click.STRING, + default=None, + help="Name of data asset key sets to use. Defaults to use all data asset keys in configuration file.", + show_default=True, +) +@click.option( + "--run-components", + type=click.STRING, + default=None, + help="Name of components to run. Defaults to use all components.", + show_default=True, +) +@click.option( + "--max-flag-code", + type=click.INT, + default=FlagCode.HALT.value, + help="Maximum flag code. If this is exceeded by any check, an error will be raised. 
Defaults to level associated with FlagCode.HALT.", + show_default=True, +) +def run( + plug_in_dir, + output, + data_dir, + runsheet_path, + data_asset_key_sets, + run_components, + max_flag_code, +): plugin = load_plugin(Path(plug_in_dir)) output = Path(output) data_dir = Path(data_dir) runsheet_path = Path(runsheet_path) - click.echo(f"Running validation protocol and outputting report to file: '{output}'")\ - + click.echo(f"Running validation protocol and outputting report to file: '{output}'") datasystem = load_data( config=plugin.config, root_path=data_dir, runsheet_path=runsheet_path, + key_sets=data_asset_key_sets.split(",") + if data_asset_key_sets is not None + else None, ) vp = plugin.protocol.validate( datasystem.dataset, report_args={"include_skipped": True}, defer_run=True, + protocol_args={ + "run_components": run_components.split(",") + if run_components is not None + else None + }, ) vp.run() @@ -64,8 +108,17 @@ def run(plug_in_dir, output, data_dir, runsheet_path): df.to_csv(output, sep="\t") click.echo(f"Writing results to '{output}'") + # Raise error if any flag code exceeds max_flag_code + flagged_messages = "\n".join( + [msg for msg in df.loc[df["code_level"] >= max_flag_code]["message"]] + ) + assert ( + df["code_level"].max() < max_flag_code + ), f"Maximum flag code exceeded: {max_flag_code}. Printing flag messages that caused this halt: {flagged_messages}" + + @click.command() -@click.argument('validation_report') +@click.argument("validation_report") def manual_checks(validation_report): click.echo(f"Reviewing pending manual checks") validation_report = Path(validation_report) @@ -75,10 +128,12 @@ def manual_checks(validation_report): manual_checks_count = 0 for _, row in df.iterrows(): - if int(row['code_level']) == FlagCode.MANUAL.value: + if int(row["code_level"]) == FlagCode.MANUAL.value: manual_checks_count += 1 - click.echo(f"Found {manual_checks_count} manual checks pending... 
Starting manual review") + click.echo( + f"Found {manual_checks_count} manual checks pending... Starting manual review" + ) analyst_id = False while not analyst_id: @@ -88,67 +143,101 @@ def manual_checks(validation_report): for _, row in df.iterrows(): logger.debug(f"Processsing: {row}") # Pass through if not manual - if int(row['code_level']) != FlagCode.MANUAL.value: + if int(row["code_level"]) != FlagCode.MANUAL.value: new_rows.append(dict(row)) else: # Manual check - logger.debug(f"""Loading as json string: {row['kwargs'].replace("'",'"')}""") - result = run_manual_check(**json.loads(row['kwargs'].replace("'",'"'))) + logger.debug( + f"""Loading as json string: {row['kwargs'].replace("'",'"')}""" + ) + result = run_manual_check(**json.loads(row["kwargs"].replace("'", '"'))) # replace original manual check notice with filled results row = dict(row) | result - row['kwargs'] = str(json.loads(row['kwargs'].replace("'",'"')) | {"analyst_ID": analyst_id}) + row["kwargs"] = str( + json.loads(row["kwargs"].replace("'", '"')) | {"analyst_ID": analyst_id} + ) new_rows.append(row) - + new_df = pd.DataFrame(new_rows) click.echo("Completed manual checks") output = validation_report.with_suffix("") - new_df.to_csv(output, index = False, sep = "\t") # Remove ".PENDING_MANUAL_CHECKS" + new_df.to_csv(output, index=False, sep="\t") # Remove ".PENDING_MANUAL_CHECKS" click.echo(f"Wrote complete report to '{output}'") @click.command() -@click.option('--output', default="protocol_spec.txt", help="Name of specification output file", show_default=True) -@click.argument('plug_in_dir') -@click.argument('data_dir') -@click.argument('runsheet_path') -def spec(plug_in_dir, output, data_dir, runsheet_path): +@click.option( + "--output", + default="protocol_spec.txt", + help="Name of specification output file", + show_default=True, +) +@click.argument("plug_in_dir") +@click.argument("data_dir") +@click.argument("runsheet_path") +@click.option( + "--data-asset-key-sets", + type=click.STRING, 
+ default=None, + help="Name of data asset key sets to use. Defaults to use all data asset keys in configuration file.", + show_default=True, +) +@click.option( + "--run-components", + type=click.STRING, + default=None, + help="Name of components to run. Defaults to use all components.", + show_default=True, +) +def spec( + plug_in_dir, output, data_dir, runsheet_path, data_asset_key_sets, run_components +): plugin = load_plugin(Path(plug_in_dir)) output = Path(output) data_dir = Path(data_dir) runsheet_path = Path(runsheet_path) - click.echo(f"Generating specification of validation protocol and outputting to file: '{output}'")\ - + click.echo( + f"Generating specification of validation protocol and outputting to file: '{output}'" + ) datasystem = load_data( config=plugin.config, root_path=data_dir, runsheet_path=runsheet_path, + key_sets=data_asset_key_sets.split(",") + if data_asset_key_sets is not None + else None, ) vp = plugin.protocol.validate( datasystem.dataset, report_args={"include_skipped": True}, defer_run=True, + protocol_args={ + "run_components": run_components.split(",") + if run_components is not None + else None + }, ) specification = vp.queued_checks( - long_description = True, - CHECK_PREFIX = "", - COMPONENT_PREFIX = " ", - INDENT_CHAR = "#", - WRAP_COMPONENT_NAME_CHAR = "", - include_checks_counters = False, - include_manual_checks = True - ) + long_description=True, + CHECK_PREFIX="", + COMPONENT_PREFIX=" ", + INDENT_CHAR="#", + WRAP_COMPONENT_NAME_CHAR="", + include_checks_counters=False, + include_manual_checks=True, + ) with open(output, "w") as f: f.write(specification) click.echo(f"Saved specification to {output}") + validation.add_command(run) validation.add_command(manual_checks) validation.add_command(spec) - diff --git a/extract_dataset.py b/extract_dataset.py new file mode 100644 index 0000000..a6860b8 --- /dev/null +++ b/extract_dataset.py @@ -0,0 +1,36 @@ +from pathlib import Path + +import click + +from 
dp_tools.core.utilites.metrics_extractor import ( + generate_extractor_from_yaml_config, + AssayType, +) + + +CONFIG_YAML = "extraction_settings.yaml" +ISA_PARSE_PATH = "isa_config.yaml" + +@click.command() +@click.option("--osd-id", help='OSD Accession ID. e.g. "OSD-194"', required=True) +def main(osd_id): + ISA_PATH = list(Path.cwd().glob("*ISA*.zip"))[0] + + metricsExtractor = generate_extractor_from_yaml_config(config=CONFIG_YAML) + + metricsExtractor.extract_data_from_isa( + accession=osd_id, + isa_archive=ISA_PATH, + config=Path(ISA_PARSE_PATH), + ) + + # metricsExtractor.append_manual_yaml_data(target_yaml=test_yaml) + + metricsExtractor.extract_sections() + + metricsExtractor.metrics.to_csv(f"{osd_id}_metrics.csv") + + metricsExtractor.process_metrics(assay_type=AssayType.bulkRNASeq).to_csv(f"{osd_id}_summary.csv") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/set_up_config_files.sh b/set_up_config_files.sh new file mode 100644 index 0000000..b93b21c --- /dev/null +++ b/set_up_config_files.sh @@ -0,0 +1,8 @@ +set -eux + +OUT_DIR=$1 + +cp assets/isa_config.yaml ${OUT_DIR} +cp assets/extraction_conf.yaml ${OUT_DIR}/extraction_settings.yaml + +# cp tests/assets/test.yaml ${OUT_DIR} # Disabled as test.yaml which contains user specified columns is not fully implemented diff --git a/setup.py b/setup.py index 7251c99..021a9fa 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="dp_tools", - version="1.3.0", + version="1.3.4", description="Tooling for Data Processing Operations", author="Jonathan Oribello", author_email="jonathan.d.oribello@gmail.com", @@ -23,7 +23,17 @@ }, include_package_data=True, python_requires=">=3.10", - install_requires=["requests", "pyyaml", "pandas==1.4.4", "schema", "tabulate", "multiqc", "pandera", "click", "loguru"], + install_requires=[ + "requests", + "pyyaml", + "pandas==1.4.4", + "schema", + "tabulate", + "multiqc", + "pandera", + "click", + "loguru", + ], setup_requires=[], 
tests_require=["pytest", "pytest-console_scripts"], entry_points={ diff --git a/tests/assets/config.yaml b/tests/assets/config.yaml new file mode 100644 index 0000000..ad70f8f --- /dev/null +++ b/tests/assets/config.yaml @@ -0,0 +1,129 @@ +Extraction Settings: + root search directory: "/CHANGEME/TO/WHERE/MQC/ARE" + sections: + - name: "raw reads" + enabled: True + multiQC: + from json: + - "raw_multiqc_report" + - "raw_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "00-RawData" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "trimmed reads" + enabled: True + multiQC: + from json: + - "trimmed_multiqc_report" + - "trimmed_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "01-TG_Preproc" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "aligned reads" + enabled: True + multiQC: + from json: + - "align_multiqc_report" + - "align_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "02-STAR_Alignment" + logs pattern(s): + - "*Log.final.out" + modules: + - "star" + + - name: "rseqc: genebody coverage" + enabled: True + multiQC: + from json: + - "geneBody_cov_multiqc_report" + - "geneBody_cov_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "02_geneBody_coverage" + logs pattern(s): + - "*.geneBodyCoverage.txt" + modules: + - "rseqc" + + - name: "rseqc: infer experiment" + enabled: True + multiQC: + from json: + - "infer_exp_multiqc_report" + - "infer_exp_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "03_infer_experiment" + logs pattern(s): + - "*infer_expt.out" + modules: + - "rseqc" + + - name: "rseqc: inner distance" + enabled: True + multiQC: + from json: + - "inner_dist_multiqc_report" + - "inner_dist_multiqc_data" + - "multiqc_data.json" + search 
recursively: True + logs directory: + - "RSeQC_Analyses" + - "04_inner_distance" + logs pattern(s): + - "*inner_distance.txt" + modules: + - "rseqc" + + - name: "rseqc: read distribution" + enabled: True + multiQC: + from json: + - "read_dist_multiqc_report" + - "read_dist_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "05_read_distribution" + logs pattern(s): + - "*read_dist.out" + modules: + - "rseqc" + + + - name: "rsem count" + enabled: True + multiQC: + from json: + - "RSEM_count_multiqc_report" + - "RSEM_count_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "03-RSEM_Counts" + logs pattern(s): + - "*.stat" + modules: + - "rsem" diff --git a/tests/assets/isa_config.yaml b/tests/assets/isa_config.yaml new file mode 100644 index 0000000..f342e52 --- /dev/null +++ b/tests/assets/isa_config.yaml @@ -0,0 +1,1321 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be renamed to "RSeQC_Analyses" for consistent casing? -J.O. 
, this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates is ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: + - Characteristics[Material Type] + # - Characteristics[organism] + ISA Table Source: Assay + Runsheet Column Name: Tissue Type + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: + - Parameter Value[library selection] + # - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: Library Prep Method + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: PE or SE + Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used for metrics table + Example: 'PE' + + - ISA Field Name: + - Parameter Value[Stranded] + - Parameter Value[stranded] + ISA Table Source: Assay + Runsheet Column Name: Stranded or Unstranded + # Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used for metrics table + Example: 'STRANDED' + + - ISA Field Name: + - Parameter Value[rRNA Contamination] + # - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: '% rRNA contamination' + # Append Column Following: "Unit" + Processing Usage: >- + Used for metrics table + Example: '13.212 percent' + + # this entry denotes the following: + # retrive from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped 
to each filename + # - ISA Field Name: + # - Parameter Value[Merged Sequence Data File] + # - Characteristics[Merged Sequence Data File] + # - Raw Data File + # ISA Table Source: Assay + # Multiple Values Per Entry: true + # Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + # Runsheet Column Name: + # - {'name':'read1_path', 'index':0} + # - {'name':'read2_path', 'index':1, 'optional':true} + # GLDS URL Mapping: true + # Processing Usage: >- + # Location to the raw data fastq file. May be a url or local path. + # Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + # - ISA Field Name: Factor Value[{factor_name}] + # ISA Table Source: [Assay, Sample] + # Runsheet Column Name: Factor Value[{factor_name}] + # Matches Multiple Columns: true + # Match Regex: "Factor Value\\[.*\\]" + # Append Column Following: "Unit" + # Processing Usage: >- + # Factor values in a study. Used to assign experimental groups for each sample. + # Note: On the runsheet, a subsequent 'Unit' Column value will be + # suffix-concatenated if it exists. + # Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. + Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. 
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory 
ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - 
"{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - 
*trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - &trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + 
tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - 
*alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - 
"05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + 
resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incorporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC is made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" \ No newline at end of file diff --git a/tests/assets/test.yaml b/tests/assets/test.yaml new file mode 100644 index 0000000..31af719 --- /dev/null +++ b/tests/assets/test.yaml @@ -0,0 +1,3 @@ +'OSD-#': 194 +'GLDS-#': 194 +'Data Source': MANUAL diff --git a/tests/test_metrics_extractions.py b/tests/test_metrics_extractions.py new file mode 100644 index 0000000..7f570ed --- /dev/null +++ b/tests/test_metrics_extractions.py @@ -0,0 +1,86 @@ +from pathlib import Path + +import pytest + +from dp_tools.core.utilites.metrics_extractor import ( + generate_extractor_from_yaml_config, + AssayType, +) + + +@pytest.fixture +def test_yaml(): + # Make the path relative to this file + TEST_DIR = Path(__file__).parent + return TEST_DIR / "assets/test.yaml" + + +@pytest.fixture +def configuration_yaml(): + # Make the path relative to this file + TEST_DIR = Path(__file__).parent + return TEST_DIR / "assets/config.yaml" + + +@pytest.fixture +def OSD_576_metrics_csv(): + TEST_DIR = Path(__file__).parent + return TEST_DIR / "assets/OSD-576_on_cluster_metrics.csv" + + +@pytest.fixture +def OSD_281_metrics_csv(): + TEST_DIR = Path(__file__).parent + return TEST_DIR / "GLDS-281_on_cluster_metrics.csv" + + +def test_extract_general_information(test_yaml): + MetricsExtractor().extract_general_information(assay_type=1, yaml_file=test_yaml) + pass + + +def test_isa_to_yaml(glds194_test_dir, test_yaml, configuration_yaml): + metricsExtractor = generate_extractor_from_yaml_config(config=configuration_yaml) + + metricsExtractor.extract_data_from_isa( + accession="GLDS-194", + isa_archive=glds194_test_dir / "Metadata/GLDS-194_metadata_GLDS-194-ISA.zip", + config=Path("/workspace/metrics_bulkRNASeq.yaml"), + ) + + 
metricsExtractor.append_manual_yaml_data(target_yaml=test_yaml) + + metricsExtractor.extract_sections() + + assert ( + set( + [ + "has_ERCC", + "organism", + "Tissue Type", + "Library Prep Method", + "PE or SE", + "Stranded or Unstranded", + "% rRNA contamination", + "Original Sample Name", + "OSD-#", + "GLDS-#", + "Data Source", + ] + ).difference(set(metricsExtractor.metrics)) + == set() + ) + + metricsExtractor.metrics.to_csv(glds194_test_dir / "test.csv") + + metricsExtractor.process_metrics(assay_type=AssayType.bulkRNASeq) + + +def test_load_and_process_metrics_table(configuration_yaml, OSD_576_metrics_csv): + metricsExtractor = generate_extractor_from_yaml_config(config=configuration_yaml) + + metricsExtractor.load_metrics_csv(metrics_csv=OSD_576_metrics_csv) + + metricsExtractor.process_metrics(assay_type=AssayType.bulkRNASeq).to_csv( + "/workspace/OSD_576_metrics_summary.csv" + )