diff --git a/.github/workflows/run_pytests.yml b/.github/workflows/run_pytests.yml index 7b3d56f..965b212 100644 --- a/.github/workflows/run_pytests.yml +++ b/.github/workflows/run_pytests.yml @@ -7,6 +7,7 @@ on: branches: - main - development + - "*hotfix*" pull_request: types: [ opened, synchronize] # Allows you to run this workflow manually from the Actions tab diff --git a/CHANGELOG.md b/CHANGELOG.md index c29db2b..528ef67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.3.4] + +### Changed + +- Table updates (associated with updating ISA archive files) now separates multiple files in a field with ',' instead of ', ' + +## [1.3.3] + +### Added + +- Support for data asset key sets and run components in updated validation interface (i.e. 
by 'dpt validation') + +## [1.3.2] + +### Fixed + +- Refactored ISA archive parsing functions as prior the fallback wasn't being used in all calls (specifically the plug in based ones) + +## [1.3.1] + +### Fixed + +- Parsing for ISA Archives met 'ISO-8859-1' encoding but not 'utf-8' + - Specifically, 'utf-8' is attempted and 'ISO-8859-1' is used as a fallback + ## [1.3.0] ### Added @@ -170,3 +195,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [1.2.0]: https://github.com/j-81/dp_tools/compare/1.1.9...1.2.0 [1.2.1]: https://github.com/j-81/dp_tools/compare/1.2.0...1.2.1 [1.3.0]: https://github.com/j-81/dp_tools/compare/1.2.1...1.3.0 +[1.3.1]: https://github.com/j-81/dp_tools/compare/1.3.0...1.3.1 +[1.3.2]: https://github.com/j-81/dp_tools/compare/1.3.1...1.3.2 +[1.3.3]: https://github.com/j-81/dp_tools/compare/1.3.2...1.3.3 +[1.3.4]: https://github.com/j-81/dp_tools/compare/1.3.3...1.3.4 diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md new file mode 100644 index 0000000..1c304b2 --- /dev/null +++ b/INSTRUCTIONS.md @@ -0,0 +1,60 @@ +# This document explains usage from Gitpod; however, beside installation, these may (untested) also work when running from containers (wrapped in appropriate `singularity` or `docker` invocations) + + +## Installation + +``` +cd $REPO_DIRECTORY # e.g. /workspace/dp_tools in gitpod +pip install -e . +``` + +## Download Relevant MultiQC & ISA archive + +> python download_multiqc_from_OSD.py --osd-id --output-dir + +### Known Limitations / Issues + +* Only supports datasets with `read distribution` MultiQC files (used as a proxy for whether the dataset is actually sequencing transcriptomics) +** Future: Should rely on parsing metadata from API + +## Copy required configuration files + +> bash set_up_config_files.sh + +This copies template yaml files from the repository code. + +## CD into directory + +> cd + +## Modify configuration files + + +### isa_config.yaml + +1. Initially, no changes +2. 
If encountering error like: `ValueError: Could not find required column '['Parameter Value[Stranded]', 'Parameter Value[stranded]']' in either ISA sample or assay table.` + * Comment out or modify item in `Staging: -> General: -> Required Metadata: -> From ISA:` section of yaml + +### extraction_settings.yaml + +1. MUST: change root search directory (line 2) to directory containing multiQC reports generated at start of this document +1. MAY: need to disable section for certain multiQC (not likely useful / will very probably break summarization) + +## Run extract & summarize script + +> python ../extract_dataset.py --osd-id # You should still be in the directory with the multiQC outputs & yaml files + +Outputs: + +1. _metrics.csv # Exhaustive metrics as pulled from multiQC reports +2. _summary.csv # Summarization and derived statistics as generated on the exhaustive metrics table + + +## Overall Known Limitations + +* Currently only supports paired end sequencing transcriptomics + * Updating this will require updating both extraction & summarization code + +* Certain ISA archives may not work + * While most missing or encoded off-spec metadata can be addressed by disabling (commenting out) sections in `extraction_settings.yaml`, certain ones like missing `library layout` (unlikely but an example) will likely require more significant changes to accommodate. 
diff --git a/OSD-201/extraction_settings.yaml b/OSD-201/extraction_settings.yaml new file mode 100644 index 0000000..e55a71c --- /dev/null +++ b/OSD-201/extraction_settings.yaml @@ -0,0 +1,130 @@ +Extraction Settings: + root search directory: "/workspace/dp_tools/OSD-201" + sections: + - name: "raw reads" + enabled: True + multiQC: + from json: + - "raw_multiqc_report" + - "raw_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "00-RawData" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "trimmed reads" + enabled: True + multiQC: + from json: + - "trimmed_multiqc_report" + - "trimmed_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "01-TG_Preproc" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "aligned reads" + enabled: True + multiQC: + from json: + - "align_multiqc_report" + - "align_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "02-STAR_Alignment" + logs pattern(s): + - "*Log.final.out" + modules: + - "star" + + - name: "rseqc: genebody coverage" + enabled: True + multiQC: + from json: + - "geneBody_cov_multiqc_report" + - "geneBody_cov_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "02_geneBody_coverage" + logs pattern(s): + - "*.geneBodyCoverage.txt" + modules: + - "rseqc" + + - name: "rseqc: infer experiment" + enabled: True + multiQC: + from json: + - "infer_exp_multiqc_report" + - "infer_exp_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "03_infer_experiment" + logs pattern(s): + - "*infer_expt.out" + modules: + - "rseqc" + + - name: "rseqc: inner distance" + enabled: True + multiQC: + from json: + - "inner_dist_multiqc_report" + - "inner_dist_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - 
"RSeQC_Analyses" + - "04_inner_distance" + logs pattern(s): + - "*inner_distance.txt" + modules: + - "rseqc" + + - name: "rseqc: read distribution" + enabled: True + multiQC: + from json: + - "read_dist_multiqc_report" + - "read_dist_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "05_read_distribution" + logs pattern(s): + - "*read_dist.out" + modules: + - "rseqc" + + + - name: "rsem count" + enabled: True + multiQC: + from json: + - "RSEM_count_multiqc_report" + - "RSEM_count_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "03-RSEM_Counts" + logs pattern(s): + - "*.stat" + modules: + - "rsem" + diff --git a/OSD-201/isa_config.yaml b/OSD-201/isa_config.yaml new file mode 100644 index 0000000..fc6b92e --- /dev/null +++ b/OSD-201/isa_config.yaml @@ -0,0 +1,1321 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be renamed to "RSeQC_Analyses" for consistent casing? -J.O. 
, this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates is ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: + - Characteristics[Material Type] + # - Characteristics[organism] + ISA Table Source: Assay + Runsheet Column Name: Tissue Type + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: + - Parameter Value[library selection] + # - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: Library Prep Method + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: PE or SE + Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used for metrics table + Example: 'PE' + + # - ISA Field Name: + # - Parameter Value[Stranded] + # - Parameter Value[stranded] + # ISA Table Source: Assay + # Runsheet Column Name: Stranded or Unstranded + # # Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + # Processing Usage: >- + # Used for metrics table + # Example: 'STRANDED' + + - ISA Field Name: + - Parameter Value[rRNA Contamination] + # - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: '% rRNA contamination' + # Append Column Following: "Unit" + Processing Usage: >- + Used for metrics table + Example: '13.212 percent' + + # this entry denotes the following: + # retrive from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one 
url is not mapped to each filename + # - ISA Field Name: + # - Parameter Value[Merged Sequence Data File] + # - Characteristics[Merged Sequence Data File] + # - Raw Data File + # ISA Table Source: Assay + # Multiple Values Per Entry: true + # Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + # Runsheet Column Name: + # - {'name':'read1_path', 'index':0} + # - {'name':'read2_path', 'index':1, 'optional':true} + # GLDS URL Mapping: true + # Processing Usage: >- + # Location to the raw data fastq file. May be a url or local path. + # Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + # - ISA Field Name: Factor Value[{factor_name}] + # ISA Table Source: [Assay, Sample] + # Runsheet Column Name: Factor Value[{factor_name}] + # Matches Multiple Columns: true + # Match Regex: "Factor Value\\[.*\\]" + # Append Column Following: "Unit" + # Processing Usage: >- + # Factor values in a study. Used to assign experimental groups for each sample. + # Note: On the runsheet, a subsequent 'Unit' Column value will be + # suffix-concatenated if it exists. + # Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. + Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. 
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory 
ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - 
"{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - 
*trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - &trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + 
tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - 
*alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - 
"05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + 
resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incoporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC if made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" \ No newline at end of file diff --git a/assets/extraction_conf.yaml b/assets/extraction_conf.yaml new file mode 100644 index 0000000..d4d7b5f --- /dev/null +++ b/assets/extraction_conf.yaml @@ -0,0 +1,49 @@ +Extraction Settings: + root search directory: "/CHANGEME/TO/WHERE/MQC/ARE" + sections: + - name: "raw reads" + enabled: True + multiQC: + from json: + - "raw_multiqc_report" + - "raw_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "00-RawData" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "trimmed reads" + enabled: True + multiQC: + from json: + - "trimmed_multiqc_report" + - "trimmed_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "01-TG_Preproc" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "aligned reads" + enabled: True + multiQC: + from json: + - "align_multiqc_report" + - "align_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "02-STAR_Alignment" + logs pattern(s): + - "*Log.final.out" + modules: + - "star" diff --git a/assets/isa_config.yaml b/assets/isa_config.yaml new file mode 100644 index 0000000..bb7ebb9 --- /dev/null +++ b/assets/isa_config.yaml @@ -0,0 +1,1325 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be 
renamed to "RSeQC_Analyses" for consistent casing? -J.O. , this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates if ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: + - Characteristics[Material Type] + # - Characteristics[organism] + ISA Table Source: Assay + Runsheet Column Name: Tissue Type + Processing Usage: >- + Used in metrics table + Example: Left retina + Fallback Value: "NOT FOUND IN ISA COLUMNS" + + - ISA Field Name: + - Parameter Value[library selection] + - Parameter Value[library Selection] # Alternative casing found first in OSD-120 + # - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: Library Prep Method + Processing Usage: >- + Used in metrics table + Example: Left retina + Fallback Value: "NOT FOUND IN ISA COLUMNS" + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: PE or SE + Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used in metrics table + Example: 'PE' + + - ISA Field Name: + - Parameter Value[Stranded] + - Parameter Value[stranded] + ISA Table Source: Assay + Runsheet Column Name: Stranded or Unstranded + # Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used in metrics table + Example: 'STRANDED' + Fallback Value: "NOT FOUND IN ISA COLUMNS" + + - ISA Field Name: + - Parameter Value[rRNA Contamination] + # - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: '% rRNA contamination' + # Append Column Following: "Unit" + Processing Usage: >- + Used in metrics table + Example: '13.212 percent' + + # this entry denotes the following: + # retrieve from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index 
doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped to each filename + # - ISA Field Name: + # - Parameter Value[Merged Sequence Data File] + # - Characteristics[Merged Sequence Data File] + # - Raw Data File + # ISA Table Source: Assay + # Multiple Values Per Entry: true + # Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + # Runsheet Column Name: + # - {'name':'read1_path', 'index':0} + # - {'name':'read2_path', 'index':1, 'optional':true} + # GLDS URL Mapping: true + # Processing Usage: >- + # Location to the raw data fastq file. May be a url or local path. + # Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + # - ISA Field Name: Factor Value[{factor_name}] + # ISA Table Source: [Assay, Sample] + # Runsheet Column Name: Factor Value[{factor_name}] + # Matches Multiple Columns: true + # Match Regex: "Factor Value\\[.*\\]" + # Append Column Following: "Unit" + # Processing Usage: >- + # Factor values in a study. Used to assign experimental groups for each sample. + # Note: On the runsheet, a subsequent 'Unit' Column value will be + # suffix-concatenated if it exists. + # Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. 
+ Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). 
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + 
+ resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include 
subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - 
&trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. 
For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: 
*AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS 
Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" 
+ + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incoporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC if made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" diff --git a/download_multiqc_from_OSD.py b/download_multiqc_from_OSD.py new file mode 100644 index 0000000..f94f588 --- /dev/null +++ b/download_multiqc_from_OSD.py @@ -0,0 +1,54 @@ +import sys +from pathlib import Path +import zipfile + +import requests +import click + +from dp_tools.glds_api.commons import find_matching_filenames, retrieve_file_url + +@click.command() +@click.option("--osd-id", help='OSD Accession ID. e.g. "OSD-194"', required=True) +@click.option("--output-dir", help='Output directory', required=False, default=".") +def main(osd_id, output_dir): + + + files = find_matching_filenames(accession=osd_id, filename_pattern=".*multiqc.*.zip") + # Ensure we also download ISA archive + files.extend( + find_matching_filenames(accession=osd_id, filename_pattern=".*ISA.*.zip") + ) + + if not any(["align_" in f for f in files]): + print( + "Did not locate align_multiqc_report zip. Inferring this isn't a dataset to be used." 
+ ) + sys.exit(0) + + + def download_file(url, local_filename): + print(f"Saving file: {local_filename} from {url}") + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(local_filename, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + def unzip_file(zip_file_name, extraction_location): + print(f"Unzipping file: {zip_file_name}") + # open the zip file in read mode + with zipfile.ZipFile(zip_file_name, 'r') as zip_ref: + # extract all the contents into the directory + zip_ref.extractall(extraction_location) + + # Setup output dir + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + for f in files: + file_location = output_dir / f + download_file(retrieve_file_url(osd_id, f), file_location) + unzip_file(file_location, output_dir) + +if __name__ == '__main__': + main() diff --git a/dp_tools/__init__.py b/dp_tools/__init__.py index 902d8c4..cd404f2 100644 --- a/dp_tools/__init__.py +++ b/dp_tools/__init__.py @@ -28,7 +28,7 @@ #### Using pip -> pip install git+https://github.com/J-81/dp_tools.git@1.3.0 +> pip install git+https://github.com/J-81/dp_tools.git@1.3.4 ## CLI Commands @@ -74,7 +74,7 @@ ``` bash # First two lines tell Singularity to run the dp_tools container in the current working directory singularity exec --bind $(pwd):$(pwd) \\ - docker://quay.io/j_81/dp_tools:1.3.0 \\ + docker://quay.io/j_81/dp_tools:1.3.4 \\ dpt-get-isa-archive --accession GLDS-168 # command we want to run ``` @@ -82,7 +82,7 @@ ``` bash # First two lines tell Singularity to run the dp_tools container in the current working directory singularity exec --bind $(pwd):$(pwd) \\ - docker://quay.io/j_81/dp_tools:1.3.0 \\ + docker://quay.io/j_81/dp_tools:1.3.4 \\ dpt-isa-to-runsheet --accession GLDS-168 \\ --config-type bulkRNASeq \\ --config-version Latest \\ @@ -90,4 +90,4 @@ ``` """ -__version__ = "1.3.0" +__version__ = "1.3.4" diff --git a/dp_tools/core/check_model.py b/dp_tools/core/check_model.py index 
78703e6..043ac3a 100644 --- a/dp_tools/core/check_model.py +++ b/dp_tools/core/check_model.py @@ -3,72 +3,84 @@ from contextlib import contextmanager import enum -from typing import ( - Callable, - TypedDict, - Union, - Literal -) +from typing import Callable, TypedDict, Union, Literal import pandas as pd from loguru import logger as log - - ALLOWED_DEV_EXCEPTIONS = ( Exception # Hooking into this with monkeypatch can be handy for testing ) -def run_manual_check(start_instruction, pass_or_fail_questions, pass_or_flag_questions) -> 'FlagEntry': - input(f"Manual Check Start Instructions: \n\t{start_instruction}.\nPress Enter to continue to questions..") + +def run_manual_check( + start_instruction, pass_or_fail_questions, pass_or_flag_questions +) -> "FlagEntry": + input( + f"Manual Check Start Instructions: \n\t{start_instruction}.\nPress Enter to continue to questions.." + ) top_level_code = FlagCode.GREEN def pass_or_fail_prompt(question: str): - ALLOWED = { # Lambda used to ensure both static and analyst responses can be supplied - "Y": (lambda: "Yes", FlagCode.GREEN), - "JF": (lambda: input("Expand on reason for failure: ").replace("\n",":::NEWLINE:::"), FlagCode.HALT), - "UF": (lambda: "No",FlagCode.HALT) - } - + ALLOWED = ( + { # Lambda used to ensure both static and analyst responses can be supplied + "Y": (lambda: "Yes", FlagCode.GREEN), + "JF": ( + lambda: input("Expand on reason for failure: ").replace( + "\n", ":::NEWLINE:::" + ), + FlagCode.HALT, + ), + "UF": (lambda: "No", FlagCode.HALT), + } + ) + while True: try: resp = ALLOWED[input(f"{question} (Y/JF/UF) : ").upper()] - return (resp[0](), resp[1]) # evalute in case justification is provided + return (resp[0](), resp[1]) # evalute in case justification is provided except KeyError: print(f"Invalid response! 
Only {list(ALLOWED)} values are allowed") continue - + def pass_or_flag_prompt(question: str): - ALLOWED = { # Lambda used to ensure both static and analyst responses can be supplied - "Y": (lambda: "Yes", FlagCode.GREEN), - "JF": (lambda: input("Expand on reason for failure: ").replace("\n",":::NEWLINE:::"), FlagCode.RED), - "UF": (lambda: "No",FlagCode.RED) - } - + ALLOWED = ( + { # Lambda used to ensure both static and analyst responses can be supplied + "Y": (lambda: "Yes", FlagCode.GREEN), + "JF": ( + lambda: input("Expand on reason for failure: ").replace( + "\n", ":::NEWLINE:::" + ), + FlagCode.RED, + ), + "UF": (lambda: "No", FlagCode.RED), + } + ) + while True: try: resp = ALLOWED[input(f"{question} (Y/JF/UF) : ").upper()] - return (resp[0](), resp[1]) # evalute in case justification is provided + return (resp[0](), resp[1]) # evalute in case justification is provided except KeyError: print(f"Invalid response! Only {list(ALLOWED)} values are allowed") continue - + responses: dict[str, dict[str, list[tuple[str, FlagCode]]]] = { - "pass/fail": {}, - "pass/flag": {}, - } + "pass/fail": {}, + "pass/flag": {}, + } for question in pass_or_fail_questions: - responses['pass/fail'][question] = pass_or_fail_prompt(question) - if responses['pass/fail'][question][1] == FlagCode.HALT: + responses["pass/fail"][question] = pass_or_fail_prompt(question) + if responses["pass/fail"][question][1] == FlagCode.HALT: top_level_code = FlagCode.HALT - + for question in pass_or_flag_questions: - responses['pass/flag'][question] = pass_or_flag_prompt(question) - if responses['pass/flag'][question][1] == FlagCode.RED: + responses["pass/flag"][question] = pass_or_flag_prompt(question) + if responses["pass/flag"][question][1] == FlagCode.RED: top_level_code = max([top_level_code, FlagCode.RED]) - return {"code": top_level_code, "message" : str(responses)} + return {"code": top_level_code, "message": str(responses)} class FlagCode(enum.Enum): @@ -321,7 +333,7 @@ def 
ancestry_is_in(self, other_list): class _QueuedCheck(TypedDict): """A queued check including checks that will be skipped""" - check_fcn: Callable[..., Union[FlagEntry,FlagEntryWithOutliers]] + check_fcn: Callable[..., Union[FlagEntry, FlagEntryWithOutliers]] """ A callable function that returns a flag entry or a string placeholder""" function: str @@ -486,7 +498,7 @@ def add( config: dict = None, description: str = None, full_description: str = None, - automated: bool = True + automated: bool = True, ): """Adds the check to the queue for each payload. Payload can be either supplied directly on the add invocation @@ -502,7 +514,7 @@ def add( description (str, optional): A description of the check function. Defaults to function name. Should be used if the function name doesn't adequately describe what is being checked. full_description (str, optional): A long, potentially multiline description of the check function. Defaults to function name. - NOT included in flag table but used to + NOT included in flag table but used to """ # override payload with one supplied directly to run if payloads: @@ -529,7 +541,7 @@ def add( # don't run if either this add call specifies skip # or if the component is being skipped "to_run": not any([skip, self.cur_component.skip]), - "automated": automated + "automated": automated, } ) return self @@ -540,7 +552,7 @@ def add_manual( start_instructions: str, skip: bool = False, pass_fail_questions: list[str] = list(), - pass_flag_questions: list[str] = list() + pass_flag_questions: list[str] = list(), ): """Adds the check to the queue for each payload. Payload can be either supplied directly on the add invocation @@ -556,38 +568,59 @@ def add_manual( description (str, optional): A description of the check function. Defaults to function name. Should be used if the function name doesn't adequately describe what is being checked. full_description (str, optional): A long, potentially multiline description of the check function. 
Defaults to function name. - NOT included in flag table but used to + NOT included in flag table but used to """ # Generate markdown style full description based on questions - pass_or_fail_block = '\n'.join([f" - {q}" for q in pass_fail_questions]) - pass_or_flag_block = '\n'.join([f" - {q}" for q in pass_flag_questions]) - pass_or_fail_section = "" if not pass_fail_questions else f"- Pass or Fail Questions:\n{pass_or_fail_block}" - pass_or_flag_section = "" if not pass_flag_questions else f"- Pass or Flag Questions:\n{pass_or_flag_block}" - full_description = textwrap.dedent(f""" + pass_or_fail_block = "\n".join( + [f" - {q}" for q in pass_fail_questions] + ) + pass_or_flag_block = "\n".join( + [f" - {q}" for q in pass_flag_questions] + ) + pass_or_fail_section = ( + "" + if not pass_fail_questions + else f"- Pass or Fail Questions:\n{pass_or_fail_block}" + ) + pass_or_flag_section = ( + "" + if not pass_flag_questions + else f"- Pass or Flag Questions:\n{pass_or_flag_block}" + ) + full_description = textwrap.dedent( + f""" - Manual Check: {description} {pass_or_fail_section} {pass_or_flag_section} - """) + """ + ) # Sanitize questions to ensure json serializable # Remove any single and double quotes from body and replace with tick marks - pass_fail_questions = [q.replace("'","`").replace('"',"`") for q in pass_fail_questions] - pass_flag_questions = [q.replace("'","`").replace('"',"`") for q in pass_flag_questions] - + pass_fail_questions = [ + q.replace("'", "`").replace('"', "`") for q in pass_fail_questions + ] + pass_flag_questions = [ + q.replace("'", "`").replace('"', "`") for q in pass_flag_questions + ] self._manual_check_queue.append( { - "check_fcn": "MANUAL_CHECK", # type: ignore + "check_fcn": "MANUAL_CHECK", # type: ignore "function": "MANUAL_CHECK", "description": description, "full_description": full_description, - "payload": {"start_instruction":start_instructions,"pass_or_fail_questions":pass_fail_questions, 
"pass_or_flag_questions":pass_flag_questions}, + "payload": { + "start_instruction": start_instructions, + "pass_or_fail_questions": pass_fail_questions, + "pass_or_flag_questions": pass_flag_questions, + }, "config": {}, "component": self.cur_component, # don't run if either this add call specifies skip # or if the component is being skipped "to_run": not any([skip, self.cur_component.skip]), - "automated": False + "automated": False, } ) return self @@ -607,7 +640,7 @@ def queued_checks( CHECK_PREFIX: str = " > ", INDENT_CHECKS_STR: str = " ", include_checks_counters: bool = True, - WRAP_COMPONENT_NAME_CHAR: str = "'" + WRAP_COMPONENT_NAME_CHAR: str = "'", ) -> str: """Returns a print-friendly string describing the queued checks. @@ -621,7 +654,7 @@ def queued_checks( Returns: str: A human friendly description of the queued checks. """ - description_field: Literal['full_description'] | Literal['description'] + description_field: Literal["full_description"] | Literal["description"] if long_description: description_field = "full_description" else: @@ -656,7 +689,9 @@ def render_self_and_children(component: ValidationProtocol._Component) -> str: count_str2 = f"[{len(check_by_component[component])}" lead_str = f"{INDENT_STR}{COMPONENT_PREFIX}{WRAP_COMPONENT_NAME_CHAR}{component.name}{WRAP_COMPONENT_NAME_CHAR}{'-> !SKIPPED!' 
if component.skip else ''}" if include_checks_counters: - buffer = f"{lead_str : <55}DIRECT:{count_str2 : >4}] ALL:{count_str : >5}]" + buffer = ( + f"{lead_str : <55}DIRECT:{count_str2 : >4}] ALL:{count_str : >5}]" + ) else: buffer = lead_str @@ -669,7 +704,8 @@ def render_self_and_children(component: ValidationProtocol._Component) -> str: ] ) check_line_print = [ - f"{line} x {line_count}" if line_count > 1 else line for line, line_count in check_lines.items() + f"{line} x {line_count}" if line_count > 1 else line + for line, line_count in check_lines.items() ] if check_lines: buffer += "\n" + "\n".join(check_line_print) @@ -696,7 +732,7 @@ def run(self, flag_unhandled_exceptions: bool = False): all_queued = self._check_queue + self._manual_check_queue for queued in all_queued: fcn = queued["check_fcn"] - if queued['automated']: + if queued["automated"]: fcn_name = fcn.__name__ else: fcn_name = "MANUAL_CHECK" @@ -746,7 +782,7 @@ def run(self, flag_unhandled_exceptions: bool = False): # peel off outlier data and keep track # using current component name as the top level key # Type ignored since FlagEntry dicts will return None as desired for this conditional - if fcn_outliers := result.pop("outliers", None): # type: ignore + if fcn_outliers := result.pop("outliers", None): # type: ignore self.outliers[queued["component"].name] = ( self.outliers[queued["component"].name] | fcn_outliers ) @@ -776,8 +812,9 @@ def run(self, flag_unhandled_exceptions: bool = False): } else: raise RuntimeError( - f"Function failed: {fcn_name} part of {queued['component']}" + f"Function failed: {fcn_name} part of {queued['component']} with original error message: {e}" ) from e + # add result (including skip flag) to component queued["component"].flags.append(packed_result) @@ -841,9 +878,11 @@ def report( # Preprocesing flags before tabulating df_data: list[dict] = list() for flag_result in unpreprocessed_df_data: - # Preprocess all 'message' and 'description' fit on one table line to 
ensure they fit on - flag_result['message'] = flag_result['message'].replace("\n","::NEWLINE::") - flag_result['description'] = flag_result['description'].replace("\n","::NEWLINE::") + # Preprocess all 'message' and 'description' fit on one table line to ensure they fit on + flag_result["message"] = flag_result["message"].replace("\n", "::NEWLINE::") + flag_result["description"] = flag_result["description"].replace( + "\n", "::NEWLINE::" + ) df_data.append(flag_result) diff --git a/dp_tools/core/configuration.py b/dp_tools/core/configuration.py index 78178b3..3a49fd3 100644 --- a/dp_tools/core/configuration.py +++ b/dp_tools/core/configuration.py @@ -60,6 +60,8 @@ def load_config(config: Union[tuple[str, str], Path]) -> dict: log.info(f"Loading config (direct path): {config}") with config.open() as f: conf_full = yaml.safe_load(f) + case _: + raise ValueError(f"Invalid config type: {type(config)}. Expected either a tuple (legacy builtin configurations) or Path object.") log.debug(f"Final config loaded: {conf_full}") diff --git a/dp_tools/core/entity_model.py b/dp_tools/core/entity_model.py index 64d20d9..d4ed880 100644 --- a/dp_tools/core/entity_model.py +++ b/dp_tools/core/entity_model.py @@ -164,7 +164,9 @@ def _load_asset( try: [asset] = asset.parent.glob(asset.name) except ValueError as exc: - raise ValueError(f"Failed to locate data asset using glob pattern: '{asset.name}'") from exc + raise ValueError( + f"Failed to locate data asset using glob pattern: '{asset.name}'" + ) from exc if not putative: assert asset.exists(), f"Failed to load asset at path '{asset}'" self.loaded_assets_dicts.append( @@ -192,7 +194,9 @@ def _load_asset( "config": {}, } ) - return DataAsset(key=key, path=asset, config=config, owner=owner, putative=putative) + return DataAsset( + key=key, path=asset, config=config, owner=owner, putative=putative + ) @property def loaded_assets_report(self) -> pd.DataFrame: diff --git a/dp_tools/core/files/isa_archive.py 
b/dp_tools/core/files/isa_archive.py index 8e457c8..d2e3465 100644 --- a/dp_tools/core/files/isa_archive.py +++ b/dp_tools/core/files/isa_archive.py @@ -7,7 +7,7 @@ import pandas as pd -log = logging.getLogger(__name__) +from loguru import logger as log ISA_INVESTIGATION_HEADERS = { "ONTOLOGY SOURCE REFERENCE", @@ -43,19 +43,29 @@ def isa_investigation_subtables(isaArchive: Path) -> dict[str, pd.DataFrame]: [i_file] = ( f for f in fetch_isa_files(isaArchive) if f.name.startswith("i_") ) - with open(i_file, "r") as f: - for line in [l.rstrip() for l in f.readlines()]: - # search for header - if line in ISA_INVESTIGATION_HEADERS: - if key != None: - tables[key] = pd.DataFrame( - table_lines - ).T # each subtable is transposed in the i_file - table_lines = list() - key = line # set next table key - else: - tokens = line.split("\t") # tab separated - table_lines.append(tokens) + # Default to 'utf-8' + try: + log.trace("Decoding ISA with 'utf-8") + with open(i_file, "r", encoding = "utf-8") as f: + lines = f.readlines() + # Fallback to "ISO-8859-1" if 'utf-8' fails + except UnicodeDecodeError: + log.warning("Failed using 'utf-8'. 
Decoding ISA with 'ISO-8859-1'") + with open(i_file, "r", encoding = "ISO-8859-1") as f: + lines = f.readlines() + for line in lines: + line = line.rstrip() + # search for header + if line in ISA_INVESTIGATION_HEADERS: + if key != None: + tables[key] = pd.DataFrame( + table_lines + ).T # each subtable is transposed in the i_file + table_lines = list() + key = line # set next table key + else: + tokens = line.split("\t") # tab separated + table_lines.append(tokens) tables[key] = pd.DataFrame( table_lines ).T # each subtable is transposed in the i_file diff --git a/dp_tools/core/post_processing.py b/dp_tools/core/post_processing.py index 11a4604..7a08709 100644 --- a/dp_tools/core/post_processing.py +++ b/dp_tools/core/post_processing.py @@ -224,7 +224,7 @@ def generate_new_column_dicts( ) # now remap those processing sample names to their orignal names, - # required for joining to orignal assay table + # required for fing to orignal assay table processing_to_orignal_mapping = pd.read_csv( dataset.data_assets["runsheet"].path, index_col="Sample Name" )["Original Sample Name"].to_dict() @@ -264,7 +264,7 @@ def generate_new_column_dicts( # joining by comma for header, header_wise in new_cols.items(): for sample, sample_wise in header_wise.items(): - new_value = ", ".join(sorted(list(new_cols[header][sample]))) + new_value = ",".join(sorted(list(new_cols[header][sample]))) new_cols[header][sample] = new_value diff --git a/dp_tools/core/utilites/metrics_extractor.py b/dp_tools/core/utilites/metrics_extractor.py new file mode 100644 index 0000000..8922cd8 --- /dev/null +++ b/dp_tools/core/utilites/metrics_extractor.py @@ -0,0 +1,1084 @@ +# Enum for assay types +import ast +from enum import Enum +import json +import pathlib +from pathlib import Path +from dataclasses import dataclass +from typing import Literal +import numpy as np + +import yaml +import pandas as pd +from loguru import logger + +from dp_tools.scripts import convert +from dp_tools.core.utilites import 
multiqc_tools + + +class AssayType(Enum): + bulkRNASeq = 1 + bulkRNASeq_VV = 2 + scRNASeq = 3 + spatialRNASeq = 4 + + +@dataclass +class MultiQCTargetSection: + targets: list[Path] + section_name: str + modules: list[str] + jsonTarget: list[str] | Literal[False] + + +class MetricsExtractor: + # Class Attributes + metrics: pd.DataFrame = pd.DataFrame() + mqc_metrics: pd.DataFrame = pd.DataFrame() + samplewise_metrics: pd.DataFrame = pd.DataFrame() + + def __init__(self, targets: list[MultiQCTargetSection]): + self.targets = targets + + # Ensure all column names are tuples + @staticmethod + def ensure_tuple(col_name, desired_length=4): + if isinstance(col_name, tuple): + # Pad with None if the tuple is smaller than desired_length + return col_name + (None,) * (desired_length - len(col_name)) + else: + # Create a tuple of desired_length with the col_name as the first element + return (col_name,) + (None,) * (desired_length - 1) + + def extract_general_information(self, assay_type: AssayType, yaml_file: str): + """This function parses data from a yaml file that is applicable on a dataset level. + + Examples include: + OSD-# + GLDS-# + Sample Name + Organism + Tissue Type + Library prep method (e.g. 
ribo-deplete (aka: totRNA) or polyA-enriched (aka: mRNA)) + % rRNA contamination + PE or SE + Stranded or Unstranded + Library prep kit + Data source (GeneLab generated, User-submitted, or Federated) + """ + # Parse the yaml file + with open(yaml_file) as file: + data = yaml.load(file, Loader=yaml.FullLoader) + + EXPECTED_KEYS = [ + "OSD-#", + "GLDS-#", + "Sample Name", + "Organism", + "Tissue Type", + "Library prep method", + "% rRNA contamination", + "PE or SE", + "Stranded or Unstranded", + "Library prep kit", + "Data source", + ] + + # Validate all keys and report all missing + missing_keys: list[str] = list() + for key in EXPECTED_KEYS: + if key not in data: + missing_keys.append(key) + + if missing_keys: + raise ValueError(f"Missing keys: {missing_keys}") + + def extract_sections(self): + def _extract_section_from_json( + self, section_name: str, json_file: Path, module: str + ): + # Load json data + with open(json_file) as file: + data = json.load(file) + + # Note: Certain modules like RSeQC don't produce a general stats table + # So we need to check if it exists before trying to extract it + if data["report_general_stats_data"]: + flat_data = multiqc_tools.get_reformated_source_dict( + data["report_general_stats_data"][ + 0 + ] # assumes only one module per multiQC json file + ) + + df_general_stats = pd.DataFrame(flat_data).T + + # Add a section as the first part of the column MultiIndex + df_general_stats.columns = pd.MultiIndex.from_tuples( + [ + (section_name, f"multiqc_{module}", "general_stats", col) + for col in df_general_stats.columns + ] + ) + + # Add section_name as last part of row MultiIndex + df_general_stats.index = df_general_stats.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_general_stats.index = pd.MultiIndex.from_tuples( + list( + zip( + df_general_stats.index.get_level_values( + "sample name" + ).str.replace( + "-", "_" + ), # Plots data performs this conversion so we match it here + 
df_general_stats.index.get_level_values( + "sample subcomponent" + ), + [section_name] * len(df_general_stats.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + df_updated_metrics = df_general_stats + has_general_stats = True + else: + has_general_stats = False + + # Same for plot data + df_plot_data = multiqc_tools.format_plots_as_dataframe( + data["report_plot_data"] + ) + # Add section_name as last part of row MultiIndex + df_plot_data.index = df_plot_data.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_plot_data.index = pd.MultiIndex.from_tuples( + list( + zip( + df_plot_data.index.get_level_values("sample name"), + df_plot_data.index.get_level_values("sample subcomponent"), + [section_name] * len(df_plot_data.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + # Add a section as the first part of the column MultiIndex + df_plot_data.columns = pd.MultiIndex.from_tuples( + [(section_name, *col) for col in df_plot_data.columns] + ) + + if has_general_stats: + df_updated_metrics = df_updated_metrics.merge( + df_plot_data, left_index=True, right_index=True + ) + else: + df_updated_metrics = df_plot_data + + # Convert all columns to tuples + columns_as_tuples = df_updated_metrics.columns.map(self.ensure_tuple) + + # Create MultiIndex + df_updated_metrics.columns = pd.MultiIndex.from_tuples(columns_as_tuples) + + self.metrics = self.metrics.append(df_updated_metrics) + + def _extract_section( + self, section_name: str, files: list[Path], modules: list[str] + ): + mqc_ret = multiqc_tools.get_parsed_data( + input_f=[str(f) for f in files], modules=modules, as_dataframe=False + ) + flat_data = multiqc_tools.flatten_raw_data(mqc_ret["report"]) + + df_general_stats = pd.DataFrame(flat_data).T + + # Add section_name as last part of row MultiIndex + df_general_stats.index = df_general_stats.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_general_stats.index = 
pd.MultiIndex.from_tuples( + list( + zip( + df_general_stats.index.get_level_values("sample name"), + df_general_stats.index.get_level_values("sample subcomponent"), + [section_name] * len(df_general_stats.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + # Metrics names may include '-', whereas all multiQC names covert these to '_' + # So here we create a temporary column with the '-' replaced with '_' for merging purposes + # if isinstance(self.metrics.index, pd.MultiIndex): + # idx_sample_name = self.metrics.index.get_level_values('sample name') + # idx_sample_name = idx_sample_name.str.replace('-','_') + # self.metrics.index = pd.MultiIndex.from_arrays( + # [ + # idx_sample_name, + # self.metrics.index.get_level_values('sample subcomponent'), + # self.metrics.index.get_level_values('name'), + # ], + # names = ['sample name','sample subcomponent','name'] + # ) + # else: + # self.metrics.index = self.metrics.index.str.replace('-','_') + # # self.metrics.index = self.metrics.index.str.replace('-','_') + + df_updated_metrics = df_general_stats + + # Same for plot data + df_plot_data = multiqc_tools.format_plots_as_dataframe(mqc_ret) + # Add section_name as last part of row MultiIndex + df_plot_data.index = df_plot_data.index.set_names( + ["sample name", "sample subcomponent"] + ) + + df_plot_data.index = pd.MultiIndex.from_tuples( + list( + zip( + df_plot_data.index.get_level_values("sample name"), + df_plot_data.index.get_level_values("sample subcomponent"), + [section_name] * len(df_plot_data.index), + ) + ) + ).set_names(["sample name", "sample subcomponent", "name"]) + + df_updated_metrics = df_updated_metrics.merge( + df_plot_data, left_index=True, right_index=True + ) + + # Add a section as the first part of the column MultiIndex + df_updated_metrics.columns = pd.MultiIndex.from_tuples( + [(section_name, *col) for col in df_updated_metrics.columns] + ) + + self.metrics = self.metrics.append(df_updated_metrics) + + for target in 
self.targets: + if target.jsonTarget == False: + _extract_section( + self, target.section_name, target.targets, target.modules + ) + else: + _extract_section_from_json( + self, target.section_name, target.jsonTarget, target.modules[0] + ) # Restriction: Can only handle one module multiQC json + # Convert index of three part tuple to MultiIndex + # Unnamed so must access part position in tuple + # self.metrics.index = pd.MultiIndex.from_tuples(self.metrics.index, names = ['sample name','sample subcomponent','name']) + # self.metrics.index = self.metrics.index.set_names(['sample name','sample subcomponent','name']) + + # Merge in samplewise metrics + metrics_reset = self.metrics.reset_index(level=["sample subcomponent", "name"]) + + samplewise_metrics_cleaned = self.samplewise_metrics.copy() + samplewise_metrics_cleaned.index = samplewise_metrics_cleaned.index.str.replace( + "-", "_" + ) + + merged = metrics_reset.merge( + samplewise_metrics_cleaned, + how="left", + left_on="sample name", + right_index=True, + ) + # Rename based on length of coerced tuples + merged = merged.rename( + columns={ + ("sample subcomponent", "", "", ""): "sample subcomponent", + ("name", "", "", ""): "name", + } + ) + merged = merged.set_index(["sample subcomponent", "name"], append=True) + + self.metrics = merged + + def extract_data_from_isa( + self, accession: str, isa_archive: pathlib.Path, config: tuple[str, str] + ): + class mock_schema: + @staticmethod + def validate(df): + pass + + samplewise_metrics = convert.isa_to_runsheet( + accession, + isa_archive, + config, + schema=mock_schema(), # type: ignore + assert_factor_values=False, + ) + + self.samplewise_metrics = samplewise_metrics + + def append_manual_yaml_data(self, target_yaml: Path): + # Start with df_isa and add columns for each key value in yaml + with open(target_yaml) as file: + new_data = yaml.safe_load(file) + + # Add the new data to the existing data as new columns + for key, value in new_data.items(): + 
self.samplewise_metrics[key] = value + + def process_metrics(self, assay_type: AssayType): + match assay_type: + case AssayType.bulkRNASeq: + df_samplewise = pd.DataFrame() + + # Copy here is inefficient but useful to keep original dataframe unmodified + df_interim = self.metrics.copy() + + # Ensure all column names are tuples + def ensure_tuple(col_name, desired_length=4): + if isinstance(col_name, tuple): + # Pad with None if the tuple is smaller than desired_length + return col_name + (None,) * (desired_length - len(col_name)) + else: + # Create a tuple of desired_length with the col_name as the first element + return (col_name,) + (None,) * (desired_length - 1) + + # Convert all columns to tuples + columns_as_tuples = df_interim.columns.map(ensure_tuple) + + # Create MultiIndex + df_interim.columns = pd.MultiIndex.from_tuples(columns_as_tuples) + + def _process_fastqc_data(df_full: pd.DataFrame, section_name: str): + df_samplewise = pd.DataFrame() + # Raw reads + df_fastqc_subset = df_full.xs( + key=(section_name, "_R1"), + axis="rows", + level=["name", "sample subcomponent"], + ) + + # M Seqs (read depth) + df_samplewise.index = df_fastqc_subset.index + try: + df_samplewise["Total Seqs"] = df_fastqc_subset[ + ( + section_name, + "multiqc_fastqc", + "general_stats", + "Total Sequences", + ) + ].astype(int) + except ( + KeyError + ): # Sometimes the key is named differently (e.g. 
OSD-511) + df_samplewise["Total Seqs"] = df_fastqc_subset[ + ( + section_name, + "multiqc_fastqc", + "general_stats", + "total_sequences", + ) + ].astype(int) + + # Read length (This can be a range, especially for trimmed reads) + df_samplewise["Mean Read Length"] = df_fastqc_subset[ + ( + section_name, + "multiqc_fastqc", + "general_stats", + "avg_sequence_length", + ) + ] + + # # Check column type and perform action + # def _process_read_length_column(column): + # if np.issubdtype(column.dtype, np.number): + # return column.astype(int), column.astype(int) + # elif column.dtype == object: + # min_length = column.str.split("-").apply( + # lambda read_length_list: read_length_list[0] + # ) + # max_length = column.str.split("-").apply( + # lambda read_length_list: read_length_list[1] + # ) + # return min_length.astype(int), max_length.astype(int) + # raise ValueError( + # "Column type not recognized. Expected int or string like object (i.e. '20-151')" + # ) + + # ( + # df_samplewise["Min Read Length"], + # df_samplewise["Max Read Length"], + # ) = _process_read_length_column(read_length_series) + + # Mean & Median Q Score (Across all bases) + df_samplewise = df_samplewise.merge( + ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs( + key=(section_name, "FastQC: Mean Quality Scores"), + axis="columns", + level=[0, 1], + ) + .agg(["mean", "median"], axis="columns") + ).rename( + columns={ + "mean": "Average Q Score (Across all plotted base positions)", + "median": "Median Q Score (Across all plotted base positions)", + } + ), + left_index=True, + right_index=True, + ) + + # % Dups + try: + df_samplewise = 100 - df_samplewise.merge( + ( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + ), + axis="columns", + ).xs(key=(section_name), axis="rows", level="name")[ + "total_deduplicated_percentage" + ] + ).rename("% Dups"), + left_index=True, + right_index=True, + ) + except: # Sometimes key names are different + 
df_samplewise = df_samplewise.merge( + ( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + ), + axis="columns", + ).xs(key=(section_name), axis="rows", level="name")[ + "percent_duplicates" + ] + ).rename("% Dups"), + left_index=True, + right_index=True, + ) + + # Mean %GC + try: + df_samplewise = df_samplewise.merge( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + "%GC", + ), + axis="columns", + ) + .xs(key=(section_name), axis="rows", level="name") + .rename("Mean %GC"), + left_index=True, + right_index=True, + ) + except KeyError: # %GC <-> percent_gc + df_samplewise = df_samplewise.merge( + df_full.xs( + key=( + section_name, + "multiqc_fastqc", + "general_stats", + "percent_gc", + ), + axis="columns", + ) + .xs(key=(section_name), axis="rows", level="name") + .rename("Mean %GC"), + left_index=True, + right_index=True, + ) + + def _first_col_reaching_min(row, min_value): + # Use boolean indexing to get columns that are >= min_value + valid = row[row >= min_value] + + # Return the first column name that satisfies the condition, or None if not found + return valid.index[0] if not valid.empty else None + + def _last_col_reaching_min(row, min_value): + # Use boolean indexing to get columns that are >= min_value + valid = row[row >= min_value] + + # Return the last column name that satisfies the condition, or None if not found + return valid.index[-1] if not valid.empty else None + + def _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum( + row, proportion + ): + # Find all columns that exceed the proportion of the row sum + valid = row[row.cumsum() >= proportion * row.sum()] + + # Return the first column name that satisfies the condition, or None if not found + return valid.index[0] if not valid.empty else None + + df_gc_plots = df_full.xs( + key=section_name, + axis="rows", + level="name", + ).xs( + key=( + section_name, + "FastQC: Per Sequence GC Content", + "FastQC: Per Sequence GC 
Content", + ), + axis="columns", + level=[0, 1, 2], + ) + + # Min %GC Reaching %1 Counts + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _first_col_reaching_min, axis="columns", min_value=1 + ).apply(lambda s: s.split()[0]) + ).rename("Min %GC reaching 1% Counts"), + left_index=True, + right_index=True, + ) + + # Max %GC Reaching %1 Counts + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _last_col_reaching_min, axis="columns", min_value=1 + ).apply(lambda s: s.split()[0]) + ).rename("Max %GC reaching 1% Counts"), + left_index=True, + right_index=True, + ) + + # 25% Quartile AUC Point %GC + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum, + axis="columns", + proportion=0.25, + ).apply(lambda s: s.split()[0]) + ).rename("25% Quartile AUC Point %GC"), + left_index=True, + right_index=True, + ) + # 50% Quartile AUC Point %GC + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum, + axis="columns", + proportion=0.50, + ).apply(lambda s: s.split()[0]) + ).rename("50% Quartile AUC Point %GC"), + left_index=True, + right_index=True, + ) + # 75% Quartile AUC Point %GC + df_samplewise = df_samplewise.merge( + ( + df_gc_plots.apply( + _get_first_column_where_cumulative_sum_exceeds_proportion_of_row_sum, + axis="columns", + proportion=0.75, + ).apply(lambda s: s.split()[0]) + ).rename("75% Quartile AUC Point %GC"), + left_index=True, + right_index=True, + ) + + # % N Content + df_n_content_plots = df_full.xs( + key=section_name, + axis="rows", + level="name", + ).xs( + key=( + section_name, + "FastQC: Per Base N Content", + "FastQC: Per Base N Content", + ), + axis="columns", + level=[0, 1, 2], + ) + + # % N Content Summed Across All Plotted Bases Positions + df_samplewise = df_samplewise.merge( + df_n_content_plots.sum(axis="columns").rename( + "% N Content Summed Across All 
Plotted Bases Positions" + ), + left_index=True, + right_index=True, + ) + + return df_samplewise + + def _process_align_data(df_full: pd.DataFrame, section_name: str): + df_samplewise = pd.DataFrame() + + df_align_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + df_samplewise.index = df_align_subset.index + df_samplewise["% Uniquely mapped"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "uniquely_mapped_percent", + ) + ].astype(float) + + df_samplewise["% Mapped to multiple loci"] = df_align_subset[ + ("multiqc_star", "general_stats", "multimapped_percent") + ].astype(float) + + df_samplewise["% Mapped to too many loci"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "multimapped_toomany_percent", + ) + ].astype(float) + + df_samplewise["% Unmapped too short"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "unmapped_tooshort_percent", + ) + ].astype(float) + + df_samplewise["% Unmapped other"] = df_align_subset[ + ( + "multiqc_star", + "general_stats", + "unmapped_other_percent", + ) + ].astype(float) + + return df_samplewise + + def _process_rseqc_genebody_coverage_data( + df_full: pd.DataFrame, section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + df_samplewise.index = df_rseqc_subset.index + + # Average % Coverage from 5-20 percentile (5' end coverage) + def _get_mean_for_percentile_range(min_range: int, max_range: int): + level_0_value = "RSeQC: Gene Body Coverage" + level_1_value = "RSeQC: Gene Body Coverage" + level_2_values_to_select = [ + f"{i} Gene Body Percentile (5' -> 3') (% Coverage)" + for i in range(min_range, max_range + 1) + ] # list of desired values for third level + + 
mask = ( + ( + df_rseqc_subset.columns.get_level_values(0) + == level_0_value + ) + & ( + df_rseqc_subset.columns.get_level_values(1) + == level_1_value + ) + & ( + df_rseqc_subset.columns.get_level_values(2).isin( + level_2_values_to_select + ) + ) + ) + + return ( + df_rseqc_subset.loc[:, mask] + .astype(float) + .mean(axis="columns") + ) + + @dataclass + class TARGET_RANGE: + lower_bound: int + upper_bound: int + label: str + + TARGET_RANGES: list[TARGET_RANGE] = [ + TARGET_RANGE(5, 20, "5' end coverage"), + TARGET_RANGE(40, 60, "middle coverage"), + TARGET_RANGE(80, 95, "3' end coverage"), + ] + + for target in TARGET_RANGES: + df_samplewise[ + f"Average % Coverage from {target.lower_bound}-{target.upper_bound} percentile ({target.label})" + ] = _get_mean_for_percentile_range( + target.lower_bound, target.upper_bound + ) + + df_samplewise["Ratio of 3' end coverage to 5' end coverage"] = ( + df_samplewise[ + f"Average % Coverage from 80-95 percentile (3' end coverage)" + ] + / df_samplewise[ + f"Average % Coverage from 5-20 percentile (5' end coverage)" + ] + ) + + return df_samplewise + + def _process_rseqc_infer_experiment_data( + df_full: pd.DataFrame, section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + df_samplewise["% Sense"] = df_rseqc_subset[ + ( + "RSeQC: Infer experiment", + "RSeQC: Infer experiment", + "Sense (% Tags)", + ) + ].astype(float) + + df_samplewise["% Antisense"] = df_rseqc_subset[ + ( + "RSeQC: Infer experiment", + "RSeQC: Infer experiment", + "Antisense (% Tags)", + ) + ].astype(float) + + df_samplewise["% Undetermined"] = df_rseqc_subset[ + ( + "RSeQC: Infer experiment", + "RSeQC: Infer experiment", + "Undetermined (% Tags)", + ) + ].astype(float) + + return df_samplewise + + def _process_rseqc_inner_distance_data( + df_full: pd.DataFrame, 
section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + # Inner Distance Peak Distance + # Extract from column tuple + # Example: ('RSeQC: Inner Distance', 'RSeQC: Inner Distance', '-117.5 Inner Distance (bp) (Counts)') + # Yields: -117.5 + try: + df_samplewise["Peak Inner Distance"] = ( + df_rseqc_subset.idxmax(axis="columns") + .apply(lambda col: col[2]) + .astype(float) + ) + except ( + ValueError + ): # e.g. ValueError: could not convert string to float: '-142.5 Inner Distance (bp) (Counts)' + df_samplewise["Peak Inner Distance"] = ( + df_rseqc_subset.idxmax(axis="columns") + .apply(lambda col: col[2].split()[0]) + .astype(float) + ) + + # % Reads At Inner Distance Peak Distance + df_samplewise["% Reads At Peak Inner Distance"] = ( + df_rseqc_subset.max(axis="columns") + / df_rseqc_subset.sum(axis="columns") + * 100 + ) + + # TAGUP: Inner distance at 1% + + return df_samplewise + + def _process_rseqc_read_distribution_data( + df_full: pd.DataFrame, section_name: str + ): + df_samplewise = pd.DataFrame() + + df_rseqc_subset = ( + df_full.xs( + key=section_name, + axis="rows", + level="name", + ) + .xs(key=section_name, axis="columns", level=0) + .droplevel("sample subcomponent", axis="rows") + ) + + @dataclass + class TARGET_LABELS: + dataframe_name: str + metrics_name: str + + TARGETS: list[TARGET_LABELS] = [ + TARGET_LABELS("CDS_Exons (# Tags)", "% CDS_Exons"), + TARGET_LABELS("5'UTR_Exons (# Tags)", "% 5'UTR_Exons"), + TARGET_LABELS("3'UTR_Exons (# Tags)", "% 3'UTR_Exons"), + TARGET_LABELS("Introns (# Tags)", "% Introns"), + TARGET_LABELS("TSS_up_1kb (# Tags)", "% TSS_up_1kb"), + TARGET_LABELS("TSS_up_1kb-5kb (# Tags)", "% TSS_up_5kb"), + TARGET_LABELS("TSS_up_5kb-10kb (# Tags)", "% TSS_up_10kb"), + TARGET_LABELS("TES_down_1kb (# Tags)", "% TES_down_1kb"), + 
TARGET_LABELS("TES_down_1kb-50kb (# Tags)", "% TES_down_5kb"), + TARGET_LABELS("TES_down_5kb-10kb", "% TES_down_10kb"), + TARGET_LABELS( + "Other_intergenic (# Tags)", "% Other_intergenic" + ), + ] + + for target in TARGETS: + try: + # Plot data + df_samplewise[target.metrics_name] = df_rseqc_subset[ + ( + "RSeQC: Read Distribution", + "RSeQC: Read Distribution", + target.dataframe_name, + ) + ] + except: + # No plot data means zero for the given tag + df_samplewise[target.metrics_name] = 0 + + # Convert all to percents by summing across row and dividing each by sum + df_samplewise = df_samplewise.apply( + lambda col: col / df_samplewise.sum(axis="columns") * 100 + ) + + return df_samplewise + + df_samplewise_raw = _process_fastqc_data(df_interim, "raw reads") + df_samplewise_trimmed = _process_fastqc_data( + df_interim, "trimmed reads" + ) + + df_samplewise_align = _process_align_data(df_interim, "aligned reads") + + #df_samplewise_rseqc_genebody_coverage = ( + # _process_rseqc_genebody_coverage_data( + # df_interim, "rseqc: genebody coverage" + # ) + #) + + #df_samplewise_rseqc_infer_experiment = ( + # _process_rseqc_infer_experiment_data( + # df_interim, "rseqc: infer experiment" + # ) + #) + + #df_samplewise_rseqc_inner_distance = _process_rseqc_inner_distance_data( + # df_interim, "rseqc: inner distance" + #) + + #df_samplewise_rseqc_read_distribution = ( + # _process_rseqc_read_distribution_data( + # df_interim, "rseqc: read distribution" + # ) + #) + + # Merge all + df_merged = ( + df_samplewise_raw.merge( + df_samplewise_trimmed, + left_index=True, + right_index=True, + suffixes=(" Raw", " Trimmed"), + ) + .merge( + df_samplewise_align, + left_index=True, + right_index=True, + ) + .sort_index() + ) + + return df_merged + + case AssayType.bulkRNASeq_VV: + df_samplewise = pd.DataFrame() + + # Copy here is inefficient but useful to keep original dataframe unmodified + df_interim = self.metrics.copy() + + # Convert all columns to tuples + columns_as_tuples = 
df_interim.columns.map(self.ensure_tuple) + + # Create MultiIndex + df_interim.columns = pd.MultiIndex.from_tuples(columns_as_tuples) + + # Raw reads + raw_reads = df_interim.xs( + key=("raw reads", "_R1"), + axis="rows", + level=["name", "sample subcomponent"], + ) + + # Read Depth Range + df_samplewise.index = raw_reads.index + df_samplewise["Total Seqs"] = raw_reads[ + ("raw reads", "multiqc_fastqc", "general_stats", "Total Sequences") + ].astype(int) + + # Read length + df_samplewise["Read Length"] = raw_reads[ + ("raw reads", "multiqc_fastqc", "general_stats", "Sequence length") + ].astype(int) + + # Mean & Median Q Score (Across all bases) + df_samplewise = df_samplewise.merge( + ( + df_interim.xs( + key="raw reads", + axis="rows", + level="name", + ) + .xs( + key=("raw reads", "FastQC: Mean Quality Scores"), + axis="columns", + level=[0, 1], + ) + .agg(["mean", "median"], axis="columns") + ).rename( + columns={ + "mean": "Average Q Score (Across all plotted base positions)", + "median": "Median Q Score (Across all plotted base positions)", + } + ), + left_index=True, + right_index=True, + ) + + # % Dups + + print("DONE") + case _: + raise NotImplementedError( + f"Assay type {assay_type} not implemented for summarization." + ) + + def load_metrics_csv(self, metrics_csv: Path): + # check\ metrics hasn't been created yet or loaded + assert self.metrics.equals( + pd.DataFrame() + ), "Metrics already loaded. Please create a new MetricsExtractor object." 
+ + self.metrics = pd.read_csv(metrics_csv, index_col=[0, 1, 2]) + + # Set index names + self.metrics.index = self.metrics.index.set_names( + ["sample name", "sample subcomponent", "name"] + ) + + # Convert column names to tuples if they represent valid tuples + def _convert_to_tuple_if_valid(col_name): + try: + # Check if the column name can be evaluated to a tuple + result = ast.literal_eval(col_name) + if isinstance(result, tuple): + return result + except (SyntaxError, ValueError): + pass + return col_name + + self.metrics.columns = [ + _convert_to_tuple_if_valid(col) for col in self.metrics.columns + ] + + +def generate_extractor_from_yaml_config(config: Path) -> MetricsExtractor: + with open(config) as file: + config_data = yaml.safe_load(file) + + targets: list[MultiQCTargetSection] = list() + + for section in config_data["Extraction Settings"]["sections"]: + if not section["enabled"]: + logger.info(f"Skipping {section['name']} because it is disabled.") + continue + + # Set up MultiQC targets + search_dir = Path( + config_data["Extraction Settings"]["root search directory"] + ) / Path(*section["multiQC"]["logs directory"]) + + if section["multiQC"].get("from json", False): + jsonTarget = Path( + config_data["Extraction Settings"]["root search directory"] + ) / Path(*section["multiQC"]["from json"]) + else: + jsonTarget = False + + found_files: list[Path] = list() + for logs_pattern in section["multiQC"]["logs pattern(s)"]: + if section["multiQC"]["search recursively"]: + found_files.extend(list(search_dir.rglob(logs_pattern))) + else: + found_files.extend(list(search_dir.glob(logs_pattern))) + + # Catch empty lists + if len(found_files) == 0 and not jsonTarget: + raise ValueError( + f"No files found for {section['name']}. Configuration may be broken or consider disabling section if data is not present." 
+ ) + + targets.append( + MultiQCTargetSection( + targets=found_files, + section_name=section["name"], + modules=section["multiQC"]["modules"], + jsonTarget=jsonTarget, + ) + ) + + return MetricsExtractor(targets=targets) diff --git a/dp_tools/core/utilites/multiqc_tools.py b/dp_tools/core/utilites/multiqc_tools.py index 7d053f5..09cf847 100644 --- a/dp_tools/core/utilites/multiqc_tools.py +++ b/dp_tools/core/utilites/multiqc_tools.py @@ -5,6 +5,7 @@ from typing import List, TypedDict import logging + log = logging.getLogger(__name__) import multiqc @@ -12,15 +13,13 @@ # iterable to remove suffixes and add them as subsource descriptors SUBSOURCES = [ - "_R1_raw", - "_R2_raw", "_R1", "_R2", "__STARpass1", ] # iterable to remove suffixes that does NOT add them as subsource descriptors (often due to the name being redundantly associated with columns) -SCRUB_SAMPLES = ["_read_dist", "_infer_expt"] +SCRUB_SAMPLES = ["_read_dist", "_infer_expt", "_raw"] def clean_messy_sample(messy_sample: str): @@ -93,12 +92,12 @@ def get_parsed_data( log.info(f"Using MQC to parse: {input_f}") try: # a workaround for flushing handlers in MQC version 1.11 - logger = log.getLogger("multiqc") - [logger.removeHandler(h) for h in logger.handlers] + # logger = log.getLogger("multiqc") + # [logger.removeHandler(h) for h in logger.handlers] mqc_ret = multiqc.run( - input_f, - no_data_dir=True, - module=modules, + input_f, + no_data_dir=True, + module=modules, quiet=True, no_ansi=True, ) # note: empty list for modules falls back on all modules @@ -170,15 +169,20 @@ class MQCRunDict(TypedDict): def get_general_stats(mqc_run_output: MQCRunDict) -> dict[str, dict]: returnDict = dict() report = mqc_run_output["report"] - mqc_modules = [list(header_entry.values())[0]['namespace'] for header_entry in report.general_stats_headers] + mqc_modules = [ + list(header_entry.values())[0]["namespace"] + for header_entry in report.general_stats_headers + ] for mqc_module, single_module_data in zip(mqc_modules, 
report.general_stats_data): returnDict[mqc_module] = single_module_data return returnDict -def format_plots_as_dataframe(mqc_rep: MQCRunDict) -> pd.DataFrame: +def format_plots_as_dataframe(mqc_rep: MQCRunDict | dict) -> pd.DataFrame: log.info(f"Formatting to dataframe") - mqc_rep = mqc_rep["report"] + if getattr(mqc_rep, "report", False): + mqc_rep = mqc_rep["report"] + # ingest plot data flat_plot_dict = format_plot_data(mqc_rep) # reformat to flatten list of dicts into single dict @@ -278,9 +282,11 @@ def __parse_xy_line_graph_to_flat_dict(plot_data): def format_plot_data(mqc_rep: dict): - log.info(f"Attempting to extract data from {len(mqc_rep.plot_data)} plots") + if mqc_rep.get("report"): + mqc_rep = mqc_rep.get("report").plot_data + log.info(f"Attempting to extract data from {len(mqc_rep)} plots") all_clean_data = dict() - for plot_key, plot_data in mqc_rep.plot_data.items(): + for plot_key, plot_data in mqc_rep.items(): log.info( f"Attempting to extract data from plot with Title: {plot_data['config']['title']}" ) diff --git a/dp_tools/glds_api/commons.py b/dp_tools/glds_api/commons.py index e3d8f12..d13655d 100644 --- a/dp_tools/glds_api/commons.py +++ b/dp_tools/glds_api/commons.py @@ -11,10 +11,10 @@ log = logging.getLogger(__name__) -GENELAB_DATASET_FILES = "https://genelab-data.ndc.nasa.gov/genelab/data/glds/files/{accession_number}" +GENELAB_DATASET_FILES = "https://osdr.nasa.gov/genelab/data/glds/files/{accession_number}" """ Template URL to access json of files for a single GLDS accession ID """ -FILE_RETRIEVAL_URL_PREFIX = "https://genelab-data.ndc.nasa.gov{suffix}" +FILE_RETRIEVAL_URL_PREFIX = "https://osdr.nasa.gov{suffix}" """ Used to retrieve files using remote url suffixes listed in the 'Data Query' API """ @functools.cache @@ -37,6 +37,7 @@ def get_table_of_files(accession: str) -> pd.DataFrame: # fetch data log.info(f"URL Source: {url}") + print(url) with urlopen(url) as response: data = yaml.safe_load(response.read()) df = 
pd.DataFrame(data['studies'][accession_osd]['study_files']) diff --git a/dp_tools/scripts/convert.py b/dp_tools/scripts/convert.py index 6c0e744..99010f4 100644 --- a/dp_tools/scripts/convert.py +++ b/dp_tools/scripts/convert.py @@ -6,6 +6,7 @@ from dp_tools.config import schemas from dp_tools.core.configuration import load_config from dp_tools.core.files import isa_archive +from dp_tools.core.files.isa_archive import isa_investigation_subtables from dp_tools.glds_api.commons import retrieve_file_url from dp_tools import plugin_api @@ -23,66 +24,6 @@ class BulkRNASeqMetadataComponent: pass -# TODO: refactor this with the analogous metadata component method -def isa_investigation_subtables(ISAarchive: Path) -> dict[str, pd.DataFrame]: - tables: dict[str, pd.DataFrame] = dict() - - # track sub table lines - table_lines: List[list] = list() - key: str = None # type: ignore - - try: - [i_file] = ( - f - for f in isa_archive.fetch_isa_files(ISAarchive) - if f.name.startswith("i_") - ) - except ValueError: - raise FileNotFoundError( - f"Could not find an i_* file inside: {ISAarchive.name}, is this an ISA archive?" 
- ) - with open(i_file, "r") as f: - for line in [l.rstrip() for l in f.readlines()]: - # search for header - if line in isa_archive.ISA_INVESTIGATION_HEADERS: - if key != None: - tables[key] = pd.DataFrame( - table_lines - ).T # each subtable is transposed in the i_file - table_lines = list() - key = line # set next table key - else: - tokens = line.split("\t") # tab separated - table_lines.append(tokens) - tables[key] = pd.DataFrame( - table_lines - ).T # each subtable is transposed in the i_file - - # reformat each table - def clean_quotes(string: str) -> str: - SINGLE_OR_DOUBLE_QUOTES = "\"'" - # don't perform on non-string elements - if not isinstance(string, str): - return string - else: - return string.lstrip(SINGLE_OR_DOUBLE_QUOTES).rstrip( - SINGLE_OR_DOUBLE_QUOTES - ) - - df: pd.DataFrame - for key, df in tables.items(): - - # note: as a ref, no reassign needed - tables[key] = ( - df.rename(columns=df.iloc[0]).drop(df.index[0]).applymap(clean_quotes) - ) - - # ensure all expected subtables present - assert set(tables.keys()) == isa_archive.ISA_INVESTIGATION_HEADERS - - return tables - - def get_assay_table_path( ISAarchive: Path, configuration: dict, return_index: bool = False ) -> Path: @@ -236,7 +177,7 @@ def get_column_name(df: pd.DataFrame, target: Union[str, list]) -> str: # TODO: Needs heavy refactoring and log messaging -def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, str], Path], inject: dict[str, str] = {}, schema: Union[DataFrameSchema, None] = None): +def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, str], Path], inject: dict[str, str] = {}, schema: Union[DataFrameSchema, None] = None, assert_factor_values: bool = True): ################################################################ ################################################################ # SETUP CONFIG AND INPUT TABLES @@ -311,6 +252,7 @@ def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, s ] for 
entry in assay_source_entries: assert list(df_final.index) == list(df_merged.index) + use_fallback_value = False if entry.get("Runsheet Index"): # already set and checked above continue @@ -403,8 +345,17 @@ def isa_to_runsheet(accession: str, isaArchive: Path, config: Union[tuple[str, s index=df_merged.index, ) else: - target_col = get_column_name(df_merged, entry["ISA Field Name"]) - series_to_add = df_merged[target_col] + try: + target_col = get_column_name(df_merged, entry["ISA Field Name"]) + series_to_add = df_merged[target_col] + except ValueError as e: # Raised when a column is not present + if entry.get("Fallback Value"): + # Create series of same row length as df_merged + series_to_add = pd.Series([entry.get("Fallback Value") for _ in range(len(df_merged))]) + use_fallback_value = True + log.warn(f"Could not find column: {entry['ISA Field Name']}. Using configured fallback value: {entry.get('Fallback Value')}") + else: + raise(e) if entry.get("GLDS URL Mapping"): def map_url_to_filename(fn: str) -> str: try: @@ -418,7 +369,9 @@ def map_url_to_filename(fn: str) -> str: map_url_to_filename ) # inplace operation doesn't seem to work series_to_add = _swap - if entry.get("Remapping"): + if use_fallback_value: + df_final[entry["Runsheet Column Name"]] = entry["Fallback Value"] + elif entry.get("Remapping"): df_final[entry["Runsheet Column Name"]] = series_to_add.map( lambda val: entry.get("Remapping")[val] ) @@ -463,10 +416,11 @@ def map_url_to_filename(fn: str) -> str: runsheet_schema.validate(df_final) - # ensure at least on Factor Value is extracted - assert ( - len([col for col in df_final.columns if col.startswith("Factor Value[")]) != 0 - ), f"Must extract at least one factor value column but only has the following columns: {df_final.columns}" + if assert_factor_values: + # ensure at least on Factor Value is extracted + assert ( + len([col for col in df_final.columns if col.startswith("Factor Value[")]) != 0 + ), f"Must extract at least one factor value 
column but only has the following columns: {df_final.columns}" ################################################################ ################################################################ diff --git a/dp_tools/scripts/vv_interface.py b/dp_tools/scripts/vv_interface.py index e6c2a19..94ac7dd 100644 --- a/dp_tools/scripts/vv_interface.py +++ b/dp_tools/scripts/vv_interface.py @@ -9,6 +9,7 @@ from dp_tools.core.loaders import load_data from dp_tools.core.check_model import ValidationProtocol, FlagCode, run_manual_check + @click.group() def cli(): pass @@ -18,30 +19,73 @@ def cli(): def validation(): pass + cli.add_command(validation) + @click.command() -@click.option('--output', default="VV_report.tsv", help="Name of report output file", show_default=True) -@click.argument('plug_in_dir') -@click.argument('data_dir') -@click.argument('runsheet_path') -def run(plug_in_dir, output, data_dir, runsheet_path): +@click.option( + "--output", + default="VV_report.tsv", + help="Name of report output file", + show_default=True, +) +@click.argument("plug_in_dir") +@click.argument("data_dir") +@click.argument("runsheet_path") +@click.option( + "--data-asset-key-sets", + type=click.STRING, + default=None, + help="Name of data asset key sets to use. Defaults to use all data asset keys in configuration file.", + show_default=True, +) +@click.option( + "--run-components", + type=click.STRING, + default=None, + help="Name of components to run. Defaults to use all components.", + show_default=True, +) +@click.option( + "--max-flag-code", + type=click.INT, + default=FlagCode.HALT.value, + help="Maximum flag code. If this is exceeded by any check, an error will be raised. 
Defaults to level associated with FlagCode.HALT.", + show_default=True, +) +def run( + plug_in_dir, + output, + data_dir, + runsheet_path, + data_asset_key_sets, + run_components, + max_flag_code, +): plugin = load_plugin(Path(plug_in_dir)) output = Path(output) data_dir = Path(data_dir) runsheet_path = Path(runsheet_path) - click.echo(f"Running validation protocol and outputting report to file: '{output}'")\ - + click.echo(f"Running validation protocol and outputting report to file: '{output}'") datasystem = load_data( config=plugin.config, root_path=data_dir, runsheet_path=runsheet_path, + key_sets=data_asset_key_sets.split(",") + if data_asset_key_sets is not None + else None, ) vp = plugin.protocol.validate( datasystem.dataset, report_args={"include_skipped": True}, defer_run=True, + protocol_args={ + "run_components": run_components.split(",") + if run_components is not None + else None + }, ) vp.run() @@ -64,8 +108,17 @@ def run(plug_in_dir, output, data_dir, runsheet_path): df.to_csv(output, sep="\t") click.echo(f"Writing results to '{output}'") + # Raise error if any flag code exceeds max_flag_code + flagged_messages = "\n".join( + [msg for msg in df.loc[df["code_level"] >= max_flag_code]["message"]] + ) + assert ( + df["code_level"].max() < max_flag_code + ), f"Maximum flag code exceeded: {max_flag_code}. Printing flag messages that caused this halt: {flagged_messages}" + + @click.command() -@click.argument('validation_report') +@click.argument("validation_report") def manual_checks(validation_report): click.echo(f"Reviewing pending manual checks") validation_report = Path(validation_report) @@ -75,10 +128,12 @@ def manual_checks(validation_report): manual_checks_count = 0 for _, row in df.iterrows(): - if int(row['code_level']) == FlagCode.MANUAL.value: + if int(row["code_level"]) == FlagCode.MANUAL.value: manual_checks_count += 1 - click.echo(f"Found {manual_checks_count} manual checks pending... 
Starting manual review") + click.echo( + f"Found {manual_checks_count} manual checks pending... Starting manual review" + ) analyst_id = False while not analyst_id: @@ -88,67 +143,101 @@ def manual_checks(validation_report): for _, row in df.iterrows(): logger.debug(f"Processsing: {row}") # Pass through if not manual - if int(row['code_level']) != FlagCode.MANUAL.value: + if int(row["code_level"]) != FlagCode.MANUAL.value: new_rows.append(dict(row)) else: # Manual check - logger.debug(f"""Loading as json string: {row['kwargs'].replace("'",'"')}""") - result = run_manual_check(**json.loads(row['kwargs'].replace("'",'"'))) + logger.debug( + f"""Loading as json string: {row['kwargs'].replace("'",'"')}""" + ) + result = run_manual_check(**json.loads(row["kwargs"].replace("'", '"'))) # replace original manual check notice with filled results row = dict(row) | result - row['kwargs'] = str(json.loads(row['kwargs'].replace("'",'"')) | {"analyst_ID": analyst_id}) + row["kwargs"] = str( + json.loads(row["kwargs"].replace("'", '"')) | {"analyst_ID": analyst_id} + ) new_rows.append(row) - + new_df = pd.DataFrame(new_rows) click.echo("Completed manual checks") output = validation_report.with_suffix("") - new_df.to_csv(output, index = False, sep = "\t") # Remove ".PENDING_MANUAL_CHECKS" + new_df.to_csv(output, index=False, sep="\t") # Remove ".PENDING_MANUAL_CHECKS" click.echo(f"Wrote complete report to '{output}'") @click.command() -@click.option('--output', default="protocol_spec.txt", help="Name of specification output file", show_default=True) -@click.argument('plug_in_dir') -@click.argument('data_dir') -@click.argument('runsheet_path') -def spec(plug_in_dir, output, data_dir, runsheet_path): +@click.option( + "--output", + default="protocol_spec.txt", + help="Name of specification output file", + show_default=True, +) +@click.argument("plug_in_dir") +@click.argument("data_dir") +@click.argument("runsheet_path") +@click.option( + "--data-asset-key-sets", + type=click.STRING, 
+ default=None, + help="Name of data asset key sets to use. Defaults to use all data asset keys in configuration file.", + show_default=True, +) +@click.option( + "--run-components", + type=click.STRING, + default=None, + help="Name of components to run. Defaults to use all components.", + show_default=True, +) +def spec( + plug_in_dir, output, data_dir, runsheet_path, data_asset_key_sets, run_components +): plugin = load_plugin(Path(plug_in_dir)) output = Path(output) data_dir = Path(data_dir) runsheet_path = Path(runsheet_path) - click.echo(f"Generating specification of validation protocol and outputting to file: '{output}'")\ - + click.echo( + f"Generating specification of validation protocol and outputting to file: '{output}'" + ) datasystem = load_data( config=plugin.config, root_path=data_dir, runsheet_path=runsheet_path, + key_sets=data_asset_key_sets.split(",") + if data_asset_key_sets is not None + else None, ) vp = plugin.protocol.validate( datasystem.dataset, report_args={"include_skipped": True}, defer_run=True, + protocol_args={ + "run_components": run_components.split(",") + if run_components is not None + else None + }, ) specification = vp.queued_checks( - long_description = True, - CHECK_PREFIX = "", - COMPONENT_PREFIX = " ", - INDENT_CHAR = "#", - WRAP_COMPONENT_NAME_CHAR = "", - include_checks_counters = False, - include_manual_checks = True - ) + long_description=True, + CHECK_PREFIX="", + COMPONENT_PREFIX=" ", + INDENT_CHAR="#", + WRAP_COMPONENT_NAME_CHAR="", + include_checks_counters=False, + include_manual_checks=True, + ) with open(output, "w") as f: f.write(specification) click.echo(f"Saved specification to {output}") + validation.add_command(run) validation.add_command(manual_checks) validation.add_command(spec) - diff --git a/extract_dataset.py b/extract_dataset.py new file mode 100644 index 0000000..a6860b8 --- /dev/null +++ b/extract_dataset.py @@ -0,0 +1,36 @@ +from pathlib import Path + +import click + +from 
dp_tools.core.utilites.metrics_extractor import ( + generate_extractor_from_yaml_config, + AssayType, +) + + +CONFIG_YAML = "extraction_settings.yaml" +ISA_PARSE_PATH = "isa_config.yaml" + +@click.command() +@click.option("--osd-id", help='OSD Accession ID. e.g. "OSD-194"', required=True) +def main(osd_id): + ISA_PATH = list(Path.cwd().glob("*ISA*.zip"))[0] + + metricsExtractor = generate_extractor_from_yaml_config(config=CONFIG_YAML) + + metricsExtractor.extract_data_from_isa( + accession=osd_id, + isa_archive=ISA_PATH, + config=Path(ISA_PARSE_PATH), + ) + + # metricsExtractor.append_manual_yaml_data(target_yaml=test_yaml) + + metricsExtractor.extract_sections() + + metricsExtractor.metrics.to_csv(f"{osd_id}_metrics.csv") + + metricsExtractor.process_metrics(assay_type=AssayType.bulkRNASeq).to_csv(f"{osd_id}_summary.csv") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/set_up_config_files.sh b/set_up_config_files.sh new file mode 100644 index 0000000..b93b21c --- /dev/null +++ b/set_up_config_files.sh @@ -0,0 +1,8 @@ +set -eux + +OUT_DIR=$1 + +cp assets/isa_config.yaml ${OUT_DIR} +cp assets/extraction_conf.yaml ${OUT_DIR}/extraction_settings.yaml + +# cp tests/assets/test.yaml ${OUT_DIR} # Disabled as test.yaml which contains user specified columns is not fully implemented diff --git a/setup.py b/setup.py index 7251c99..021a9fa 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="dp_tools", - version="1.3.0", + version="1.3.4", description="Tooling for Data Processing Operations", author="Jonathan Oribello", author_email="jonathan.d.oribello@gmail.com", @@ -23,7 +23,17 @@ }, include_package_data=True, python_requires=">=3.10", - install_requires=["requests", "pyyaml", "pandas==1.4.4", "schema", "tabulate", "multiqc", "pandera", "click", "loguru"], + install_requires=[ + "requests", + "pyyaml", + "pandas==1.4.4", + "schema", + "tabulate", + "multiqc", + "pandera", + "click", + "loguru", + ], setup_requires=[], 
tests_require=["pytest", "pytest-console_scripts"], entry_points={ diff --git a/tests/assets/config.yaml b/tests/assets/config.yaml new file mode 100644 index 0000000..ad70f8f --- /dev/null +++ b/tests/assets/config.yaml @@ -0,0 +1,129 @@ +Extraction Settings: + root search directory: "/CHANGEME/TO/WHERE/MQC/ARE" + sections: + - name: "raw reads" + enabled: True + multiQC: + from json: + - "raw_multiqc_report" + - "raw_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "00-RawData" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "trimmed reads" + enabled: True + multiQC: + from json: + - "trimmed_multiqc_report" + - "trimmed_multiqc_data" + - "multiqc_data.json" + search recursively: False + logs directory: + - "01-TG_Preproc" + - "FastQC_Reports" + logs pattern(s): + - "*fastqc.zip" + modules: + - "fastqc" + + - name: "aligned reads" + enabled: True + multiQC: + from json: + - "align_multiqc_report" + - "align_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "02-STAR_Alignment" + logs pattern(s): + - "*Log.final.out" + modules: + - "star" + + - name: "rseqc: genebody coverage" + enabled: True + multiQC: + from json: + - "geneBody_cov_multiqc_report" + - "geneBody_cov_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "02_geneBody_coverage" + logs pattern(s): + - "*.geneBodyCoverage.txt" + modules: + - "rseqc" + + - name: "rseqc: infer experiment" + enabled: True + multiQC: + from json: + - "infer_exp_multiqc_report" + - "infer_exp_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "03_infer_experiment" + logs pattern(s): + - "*infer_expt.out" + modules: + - "rseqc" + + - name: "rseqc: inner distance" + enabled: True + multiQC: + from json: + - "inner_dist_multiqc_report" + - "inner_dist_multiqc_data" + - "multiqc_data.json" + search 
recursively: True + logs directory: + - "RSeQC_Analyses" + - "04_inner_distance" + logs pattern(s): + - "*inner_distance.txt" + modules: + - "rseqc" + + - name: "rseqc: read distribution" + enabled: True + multiQC: + from json: + - "read_dist_multiqc_report" + - "read_dist_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "RSeQC_Analyses" + - "05_read_distribution" + logs pattern(s): + - "*read_dist.out" + modules: + - "rseqc" + + + - name: "rsem count" + enabled: True + multiQC: + from json: + - "RSEM_count_multiqc_report" + - "RSEM_count_multiqc_data" + - "multiqc_data.json" + search recursively: True + logs directory: + - "03-RSEM_Counts" + logs pattern(s): + - "*.stat" + modules: + - "rsem" diff --git a/tests/assets/isa_config.yaml b/tests/assets/isa_config.yaml new file mode 100644 index 0000000..f342e52 --- /dev/null +++ b/tests/assets/isa_config.yaml @@ -0,0 +1,1321 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be renamed to "RSeQC_Analyses" for consistent casing? -J.O. 
, this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates is ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: + - Characteristics[Material Type] + # - Characteristics[organism] + ISA Table Source: Assay + Runsheet Column Name: Tissue Type + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: + - Parameter Value[library selection] + # - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: Library Prep Method + Processing Usage: >- + Used for metrics table + Example: Left retina + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: PE or SE + Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used for metrics table + Example: 'PE' + + - ISA Field Name: + - Parameter Value[Stranded] + - Parameter Value[stranded] + ISA Table Source: Assay + Runsheet Column Name: Stranded or Unstranded + # Remapping: {"PAIRED":'PE', "Paired":'PE', "SINGLE":'SE'} + Processing Usage: >- + Used for metrics table + Example: 'STRANDED' + + - ISA Field Name: + - Parameter Value[rRNA Contamination] + # - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: '% rRNA contamination' + # Append Column Following: "Unit" + Processing Usage: >- + Used for metrics table + Example: '13.212 percent' + + # this entry denotes the following: + # retrive from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped 
to each filename + # - ISA Field Name: + # - Parameter Value[Merged Sequence Data File] + # - Characteristics[Merged Sequence Data File] + # - Raw Data File + # ISA Table Source: Assay + # Multiple Values Per Entry: true + # Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + # Runsheet Column Name: + # - {'name':'read1_path', 'index':0} + # - {'name':'read2_path', 'index':1, 'optional':true} + # GLDS URL Mapping: true + # Processing Usage: >- + # Location to the raw data fastq file. May be a url or local path. + # Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + # - ISA Field Name: Factor Value[{factor_name}] + # ISA Table Source: [Assay, Sample] + # Runsheet Column Name: Factor Value[{factor_name}] + # Matches Multiple Columns: true + # Match Regex: "Factor Value\\[.*\\]" + # Append Column Following: "Unit" + # Processing Usage: >- + # Factor values in a study. Used to assign experimental groups for each sample. + # Note: On the runsheet, a subsequent 'Unit' Column value will be + # suffix-concatenated if it exists. + # Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. + Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. 
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory 
ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - 
"{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - 
*trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - &trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + 
tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - 
*alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - 
"05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + 
resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incorporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC is made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" \ No newline at end of file diff --git a/tests/assets/test.yaml b/tests/assets/test.yaml new file mode 100644 index 0000000..31af719 --- /dev/null +++ b/tests/assets/test.yaml @@ -0,0 +1,3 @@ +'OSD-#': 194 +'GLDS-#': 194 +'Data Source': MANUAL diff --git a/tests/test_metrics_extractions.py b/tests/test_metrics_extractions.py new file mode 100644 index 0000000..7f570ed --- /dev/null +++ b/tests/test_metrics_extractions.py @@ -0,0 +1,86 @@ +from pathlib import Path + +import pytest + +from dp_tools.core.utilites.metrics_extractor import ( + generate_extractor_from_yaml_config, + AssayType, +) + + +@pytest.fixture +def test_yaml(): + # Make the path relative to this file + TEST_DIR = Path(__file__).parent + return TEST_DIR / "assets/test.yaml" + + +@pytest.fixture +def configuration_yaml(): + # Make the path relative to this file + TEST_DIR = Path(__file__).parent + return TEST_DIR / "assets/config.yaml" + + +@pytest.fixture +def OSD_576_metrics_csv(): + TEST_DIR = Path(__file__).parent + return TEST_DIR / "assets/OSD-576_on_cluster_metrics.csv" + + +@pytest.fixture +def OSD_281_metrics_csv(): + TEST_DIR = Path(__file__).parent + return TEST_DIR / "GLDS-281_on_cluster_metrics.csv" + + +def test_extract_general_information(test_yaml): + MetricsExtractor().extract_general_information(assay_type=1, yaml_file=test_yaml) + pass + + +def test_isa_to_yaml(glds194_test_dir, test_yaml, configuration_yaml): + metricsExtractor = generate_extractor_from_yaml_config(config=configuration_yaml) + + metricsExtractor.extract_data_from_isa( + accession="GLDS-194", + isa_archive=glds194_test_dir / "Metadata/GLDS-194_metadata_GLDS-194-ISA.zip", + config=Path("/workspace/metrics_bulkRNASeq.yaml"), + ) + + 
metricsExtractor.append_manual_yaml_data(target_yaml=test_yaml) + + metricsExtractor.extract_sections() + + assert ( + set( + [ + "has_ERCC", + "organism", + "Tissue Type", + "Library Prep Method", + "PE or SE", + "Stranded or Unstranded", + "% rRNA contamination", + "Original Sample Name", + "OSD-#", + "GLDS-#", + "Data Source", + ] + ).difference(set(metricsExtractor.metrics)) + == set() + ) + + metricsExtractor.metrics.to_csv(glds194_test_dir / "test.csv") + + metricsExtractor.process_metrics(assay_type=AssayType.bulkRNASeq) + + +def test_load_and_process_metrics_table(configuration_yaml, OSD_576_metrics_csv): + metricsExtractor = generate_extractor_from_yaml_config(config=configuration_yaml) + + metricsExtractor.load_metrics_csv(metrics_csv=OSD_576_metrics_csv) + + metricsExtractor.process_metrics(assay_type=AssayType.bulkRNASeq).to_csv( + "/workspace/OSD_576_metrics_summary.csv" + )