Merge pull request #237 from ncihtan/236-fix-missing-mandatory-attrib…

…utes-for-bulk-metlylation-sequencing-l1
ncihtan · Jun 20, 2023 · 857d3ac · 857d3ac
2 parents 9f7c221 + 7a43555
commit 857d3ac
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 18 deletions.
diff --git a/.github/workflows/check_components.py b/.github/workflows/check_components.py
@@ -0,0 +1,46 @@
+import pandas as pd
+import sys
+model = pd.read_csv('HTAN.model.csv')  # relative path: resolved against the current working directory
+
+errors = []  # one message per missing (attribute, required entry) pair; reported together at the end
+
+# Level 1 test: every attribute whose name ends in "Level 1" must list each l1_required entry in DependsOn
+
+level1 =  model[model.Attribute.str.contains('Level 1$')][['Attribute','DependsOn']]  # pattern is a regex; '$' anchors at the end of the name
+
+level1['DependsOnList'] = level1['DependsOn'].map(lambda x: x.split(','))  # NOTE(review): an empty DependsOn cell is read as NaN (float) and would raise AttributeError here; assigning to a filtered frame may also emit SettingWithCopyWarning
+level1['DependsOnList'] = level1['DependsOnList'].map(lambda x: list(map(str.strip,x)))  # drop the whitespace around each comma-separated entry
+
+l1_required = ['Component', 'Filename', 'File Format', 'HTAN Parent Biospecimen ID','HTAN Data File ID']
+
+for req in l1_required:
+    for i, row in level1.iterrows():
+        if req in row['DependsOnList']:  # exact match against the stripped entries
+            pass
+        else:
+            e = f'{req} is missing from DependsOn for attribute {row["Attribute"]}'
+            errors.append(e)
+
+# Level 2, 3 and 4 test: same check, but the parent link is 'HTAN Parent Data File ID' instead of the biospecimen ID
+level234 =  model[model.Attribute.str.contains(r'Level [2|3|4]$')][['Attribute','DependsOn']]  # NOTE(review): [2|3|4] is a character class, so it also accepts a literal '|'; [234] expresses the apparent intent
+
+level234['DependsOnList'] = level234['DependsOn'].map(lambda x: x.split(','))  # NOTE(review): same NaN / SettingWithCopyWarning caveats as the Level 1 split above
+level234['DependsOnList'] = level234['DependsOnList'].map(lambda x: list(map(str.strip,x)))  # drop the whitespace around each comma-separated entry
+
+level234_required = ['Component', 'Filename', 'File Format', 'HTAN Parent Data File ID','HTAN Data File ID']
+
+for req in level234_required:
+    for i, row in level234.iterrows():
+        if req in row['DependsOnList']:
+            pass
+        elif (row['Attribute'] in ['Imaging Level 2','SRRS Imaging Level 2']) and (req == 'HTAN Parent Data File ID'):
+            # Exempt: these two attributes are allowed to omit 'HTAN Parent Data File ID'; every other requirement still applies to them
+            pass
+        else:
+            e = f'{req} is missing from DependsOn for attribute {row["Attribute"]}'
+            errors.append(e)
+
+if len(errors) == 0:
+    pass  # nothing missing: fall off the end of the script, exit status 0
+else:
+    sys.exit(errors)  # a non-integer argument is printed to stderr (here the list's repr) and the exit status is 1
diff --git a/.github/workflows/ci-lint-validate-convert.yml b/.github/workflows/ci-lint-validate-convert.yml
@@ -50,11 +50,32 @@ jobs:
           schematic model -c .github/CSV_schematic_config.yml validate -mp HTAN.model.csv -dt "DataModel" | 
             grep "Your manifest has been validated successfully. There are no errors in your manifest, and it can be submitted without any modifications."
 
+  component_check:  # runs .github/workflows/check_components.py; the convert job lists this job under needs
+    name: Check Component DependsOn entries
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}  # NOTE(review): check_components.py does not appear to read this variable; confirm whether it is needed here
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4 
+        with:
+          python-version: '3.10'  # quoted so YAML keeps it a string rather than parsing it as the number 3.1
+
+      - name: Install Python dependencies
+        run: python -m pip install --upgrade pip pandas  # pandas is the script's only third-party import; its version is not pinned
+
+      - name: Run script  # the script opens 'HTAN.model.csv' by relative path, so it must run from the repository root
+        shell: bash
+        run: |
+          python .github/workflows/check_components.py
+
   convert:
     name: Convert CSV to JSON-LD
     needs:
       - lint
       - validate
+      - component_check
     runs-on: ubuntu-latest
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/HTAN.model.csv b/HTAN.model.csv
@@ -58,10 +58,10 @@ scmC-seq Level 1,Files contain raw scmC-seq data.,,"Component, Filename, File Fo
 scmC-seq Level 2,"Files contain scmC-seq files containing aligned sequence data, as a BAM file.",,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Data File ID, Alignment Workflow Url, Alignment Workflow Type, Genomic Reference, Genomic, Reference URL, Index File Name, Average Base Quality, Average Insert Size, Average Read Length, Contamination, Contamination Error, Mean Coverage, Pairs On Diff CHR, Total Reads, Total Uniquely Mapped, Total Unmapped reads, Proportion Reads Duplicated, Proportion Reads Mapped, Proportion Targets No Coverage, Proportion Base Mismatch, Short Reads, Proportion Reads Mapped, Proportion Targets No Coverage",,FALSE,Sequencing,scmC-seq Level 1,,
 scATAC-seq Level 4,"Data represents the relationships between cells derived from Level 3 expression data and shown as tSNE or UMAP coordinates per cell, plus all other cell-specific meta information (e.g., cell type)",,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Data File ID, scATACseq Workflow Type, scATACseq Workflow Parameters Description, Workflow Version, Workflow Link",,FALSE,Sequencing,scATAC-seq Level 3,,
 scDNA-seq Level 1,Single-cell DNA-seq,,"Component, Filename, File Format, HTAN Parent Biospecimen ID, HTAN Data File ID, Sequencing Batch ID, Library Layout, Nucleic Acid Source, Library Selection Method, Read Length, Library Preparation Kit Name, Library Preparation Kit Vendor, Library Preparation Kit Version, Adapter Name, Adapter Sequence, Base Caller Name, Base Caller Version, Flow Cell Barcode, Fragment Maximum Length, Fragment Mean Length, Fragment Minimum Length, Fragment Standard Deviation Length, Lane Number, Library Strand, Multiplex Barcode, Size Selection Range, Target Depth, To Trim Adapter Sequence, Adapter Content, Basic Statistics, Encoding, Kmer Content, Overrepresented Sequences, Per Base N Content, Per Base Sequence Content, Per Base Sequence Quality, Per Sequence GC Content, Per Sequence Quality Score, Per Tile Sequence Quality, Percent GC Content, Sequence Duplication Levels, Sequence Length Distribution, Total Reads, QC Workflow Type, QC Workflow Version, QC Workflow Link",,FALSE,Sequencing,Biospecimen,,
-scDNA-seq Level 2,Alignment workflows downstream of scDNA-seq Level 1,,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Parent Data File ID, Alignment Workflow Url, Alignment Workflow Type, Genomic Reference, Genomic Reference URL, Index File Name, Average Base Quality, Average Insert Size, Average Read Length, Mean Coverage, Pairs On Diff CHR, Total Reads, Proportion Reads Mapped, MapQ30, Total Uniquely Mapped, Total Unmapped reads,Proportion Reads Duplicated, Short Reads, Proportion Coverage 10X, Proportion Coverage 30X, Proportion Targets No Coverage, Proportion Base Mismatch, Proportion Mitochondrial Reads, Contamination, Contamination Error",,FALSE,Sequencing,scDNA-seq Level 1,,
-Bulk Methylation-seq Level 1,"Raw data for bulk methylation sequencing, such as FASTQs and unaligned BAMs",,"Component, Filename, File Format, HTAN Biospecimen ID, Biospecimen Preparation, Nucleic Acid Source, Bisulfite Conversion, Sequencing Platform, Replicate Type, Bulk Methylation Assay Type, Total DNA Input",,FALSE,Sequencing,Biospecimen,,
-Bulk Methylation-seq Level 2,"Aligned primary data for bulk methylation sequencing, such as gene expression matrix files, VCFs, etc.",,"Component, Filename, File Format, HTAN Biospecimen ID, HTAN Parent Data File ID, Alignment Workflow Url, Trimmer, Bulk Methylation Genomic Reference, Genomic Reference URL, Index File Name, Alignment Workflow Type, Duplicate Removal Software, Mean Coverage, Library Layout, Average Base Quality, Average Insert Size, Average Read Length, Contamination, Contamination Error, Pairs On Diff CHR, Total Reads, Total Uniquely Mapped, Total Unmapped reads, Proportion Reads Duplicated, Proportion Reads Mapped, Proportion Targets No Coverage, Proportion Base Mismatch, Short Reads, Proportion of Minimum CpG Coverage 10X, Proportion Coverage 30X, Proportion Reads Duplicated, Proportion Targets No Coverage",,FALSE,Sequencing,"Bulk Methylation-seq Level 1, Biospecimen",,
-Bulk Methylation-seq Level 3,"Sample level summary data for bulk methylation sequencing, such as t-SNE plot coordinates, etc.",,"Component, Filename, File Format, HTAN Biospecimen ID, HTAN Parent Biospecimen ID, HTAN Parent Data File ID, DMC Calling Tool, DMC Calling Workflow URL, DMR Calling Tool, DMR Calling Workflow URL, pUC19 methylation ratio, Lambda methylation ratio, DMC data file format, DMR data file Format",,FALSE,Sequencing,"Bulk Methylation-seq Level 2, Biospecimen",,
+scDNA-seq Level 2,Alignment workflows downstream of scDNA-seq Level 1,,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Data File ID, Alignment Workflow Url, Alignment Workflow Type, Genomic Reference, Genomic Reference URL, Index File Name, Average Base Quality, Average Insert Size, Average Read Length, Mean Coverage, Pairs On Diff CHR, Total Reads, Proportion Reads Mapped, MapQ30, Total Uniquely Mapped, Total Unmapped reads,Proportion Reads Duplicated, Short Reads, Proportion Coverage 10X, Proportion Coverage 30X, Proportion Targets No Coverage, Proportion Base Mismatch, Proportion Mitochondrial Reads, Contamination, Contamination Error",,FALSE,Sequencing,scDNA-seq Level 1,,
+Bulk Methylation-seq Level 1,"Raw data for bulk methylation sequencing, such as FASTQs and unaligned BAMs",,"Component, Filename, File Format, HTAN Parent Biospecimen ID, HTAN Data File ID, Biospecimen Preparation, Nucleic Acid Source, Bisulfite Conversion, Sequencing Platform, Replicate Type, Bulk Methylation Assay Type, Total DNA Input",,FALSE,Sequencing,Biospecimen,,
+Bulk Methylation-seq Level 2,"Aligned primary data for bulk methylation sequencing, such as gene expression matrix files, VCFs, etc.",,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Data File ID, Alignment Workflow Url, Trimmer, Bulk Methylation Genomic Reference, Genomic Reference URL, Index File Name, Alignment Workflow Type, Duplicate Removal Software, Mean Coverage, Library Layout, Average Base Quality, Average Insert Size, Average Read Length, Contamination, Contamination Error, Pairs On Diff CHR, Total Reads, Total Uniquely Mapped, Total Unmapped reads, Proportion Reads Duplicated, Proportion Reads Mapped, Proportion Targets No Coverage, Proportion Base Mismatch, Short Reads, Proportion of Minimum CpG Coverage 10X, Proportion Coverage 30X, Proportion Reads Duplicated, Proportion Targets No Coverage",,FALSE,Sequencing,"Bulk Methylation-seq Level 1, Biospecimen",,
+Bulk Methylation-seq Level 3,"Sample level summary data for bulk methylation sequencing, such as t-SNE plot coordinates, etc.",,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Data File ID,DMC Calling Tool, DMC Calling Workflow URL, DMR Calling Tool, DMR Calling Workflow URL, pUC19 methylation ratio, Lambda methylation ratio, DMC data file format, DMR data file Format",,FALSE,Sequencing,"Bulk Methylation-seq Level 2, Biospecimen",,
 Imaging Level 1,Raw imaging data,,"Component, Filename, File Format, HTAN Parent Biospecimen ID, HTAN Data File ID, Imaging Assay Type, Protocol Link, Software and Version, Commit SHA, Pre-processing Completed, Pre-processing Required, Comment",,FALSE,Assay,Biospecimen,,
 Imaging Level 2,Raw and pre-processed image data,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID,  HTAN Data File ID, Channel Metadata Filename, Imaging Assay Type, Protocol Link, Software and Version, Microscope, Objective, NominalMagnification, LensNA, WorkingDistance,WorkingDistanceUnit, Immersion, Pyramid, Zstack, Tseries, Passed QC, Comment, FOV number, FOVX, FOVXUnit, FOVY, FOVYUnit, Frame Averaging, Image ID, DimensionOrder, PhysicalSizeX, PhysicalSizeXUnit, PhysicalSizeY, PhysicalSizeYUnit, PhysicalSizeZ, PhysicalSizeZUnit, Pixels BigEndian, PlaneCount, SizeC, SizeT, SizeX, SizeY, SizeZ, PixelType, MERFISH Positions File, MERFISH Codebook File",,FALSE,Assay,Imaging Level 1,,
 MERFISH Positions File,The positions file is an auxiliary MERFISH file that describes the location of bead positions in the assay.,,,,FALSE,Assay,,,
@@ -97,8 +97,8 @@ Sequencing Saturation,"The fraction of reads originating from an already-observe
 Proportion Reads Mapped to Transcriptome,Fraction of reads that mapped to a unique gene in the transcriptome. The read must be consistent with annotated splice junctions. These reads are considered for UMI counting.,,,,TRUE,Spatial Transcriptomics,,,
 Median Number Genes per Spatial Spot,The median number of genes detected per spot under tissue-associated barcode. Detection is defined as the presence of at least 1 UMI count.,,,,TRUE,Spatial Transcriptomics,,,num
 HI-C-seq Level 1,Unaligned sequence data,,"Component, HTAN Parent Biospecimen ID, HTAN Data File ID, Filename, File Format, Genomic Reference, Sequencing Platform, Nucleic Acid Source, Technical Replicate Group, Transposition Reaction, Crosslinking Condtion, DNA Digestion Condition, Nuclei Permeabilization Method, Ligation Condition, Biotin Enrichment, DNA Input Amount, Total Reads, Protocol Link",,FALSE,Sequencing,Biospecimen,,
-HI-C-seq Level 2,"Aligned read pairs, contact matrix",,"Component, HTAN Data File ID, HTAN Parent Data File ID, Filename, Genomic Reference, Aligned Read Length, Tool, Resolution, Normalization Method",,FALSE,Sequencing,HI-C-seq Level 1,,
-HI-C-seq Level 3,Summary data for the HI-C-seq assay.,,"Component, HTAN Parent Data File ID, HTAN Data File ID, Filename,  Genomic Reference, Stripe Calling, Loop Window, Stripe Window, Loop Calling",,FALSE,Sequencing,HI-C-seq Level 2,,
+HI-C-seq Level 2,"Aligned read pairs, contact matrix",,"Component, HTAN Data File ID, HTAN Parent Data File ID, Filename, File Format, Genomic Reference, Aligned Read Length, Tool, Resolution, Normalization Method",,FALSE,Sequencing,HI-C-seq Level 1,,
+HI-C-seq Level 3,Summary data for the HI-C-seq assay.,,"Component, HTAN Parent Data File ID, HTAN Data File ID, Filename, File Format, Genomic Reference, Stripe Calling, Loop Window, Stripe Window, Loop Calling",,FALSE,Sequencing,HI-C-seq Level 2,,
 Crosslinking Condtion,Detailed condition for DNA crosslinking,,,,TRUE,Sequencing,,,
 DNA Digestion Condition,Enzymes and treatment length/temperature for genome digestion,,,,TRUE,Sequencing,,,
 Nuclei Permeabilization Method,Detergent and treatment condition for nuclei permeabilization and crosslinking softening,,,,TRUE,Sequencing,,,
@@ -112,10 +112,10 @@ Stripe Window,"Binning size used for calling significant architectural stripes.
 Loop Calling,Tool used for identifying loop interactions,,,,TRUE,Sequencing,,,
 Imaging Level 4,Derived imaging data: Object-by-feature array,,"Component, Filename, File Format, HTAN Parent Data File ID, HTAN Parent Channel Metadata ID, HTAN Data File ID, Parameter file, Software and Version, Commit SHA,Number of Objects, Number of Features,Imaging Object Class, Imaging Summary Statistic",,FALSE,Assay,Imaging Level 3 Channels,,
 SRRS Imaging Level 2,SRRS-specific HTAN raw and pre-processed image data,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID,  HTAN Data File ID, Channel Metadata Filename, Imaging Assay Type, Protocol Link, Software and Version, Microscope, Objective, NominalMagnification, Pyramid, Zstack, Tseries, Passed QC, Frame Averaging, Image ID, DimensionOrder, PhysicalSizeX, PhysicalSizeXUnit, PhysicalSizeY, PhysicalSizeYUnit, Pixels BigEndian, PlaneCount, SizeC, SizeT, SizeX, SizeY, SizeZ, PixelType",,FALSE,Assay,Biospecimen,,
-RPPA Level 2,Array based protemics. Each dilution curve of spot intensities is fitted using the monotone increasing B-spline model in the SuperCurve R package. This fits a single curve using all the samples on a slide with the signal intensity as the response variable and the dilution steps as independent variables. The fitted curve is plotted with the signal intensities on the y-axis and the log2-concentration of proteins on the x-axis for diagnostic purposes.,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID,  HTAN Parent Data File ID, HTAN RPPA Antibody Table, Assay Type, Protocol Link, Software and Version",,FALSE,Assay,Biospecimen,,
+RPPA Level 2,Array based protemics. Each dilution curve of spot intensities is fitted using the monotone increasing B-spline model in the SuperCurve R package. This fits a single curve using all the samples on a slide with the signal intensity as the response variable and the dilution steps as independent variables. The fitted curve is plotted with the signal intensities on the y-axis and the log2-concentration of proteins on the x-axis for diagnostic purposes.,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID,  HTAN Parent Data File ID, HTAN Data File ID, HTAN RPPA Antibody Table, Assay Type, Protocol Link, Software and Version",,FALSE,Assay,Biospecimen,,
 HTAN RPPA Antibody Table,A table containing antibody level metadata for RPPA,,"HTAN RPPA Antibody Table ID, Filename, File Format, Ab Name Reported on Dataset, GENCODE Gene Symbol Target, UNIPROT Protein ID Target, Phosphoprotein Flag, Vendor, Catalog Number, Internal Ab ID, Species, RPPA Dilution, Phospho Site, RPPA Validation Status, Clone, Clonality, Notes",,TRUE,RPPA Level 2,RPPA Level 2,,
-RPPA Level 3,Level 3 Reverse Phase Protein Array (RPPA) data contains intra-batch normalized intensities.,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID,  HTAN Parent Data File ID,  Assay Type, Software and Version, Normalization Method",,FALSE,Assay,Biospecimen,,
-RPPA Level 4,Level 4 Reverse Phase Protein Array (RPPA) data contains intra-batch corrected intensities.,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID, HTAN Parent Data File ID, Assay Type, Batch Correction Method",,FALSE,Assay,RPPA Level 2,,
+RPPA Level 3,Level 3 Reverse Phase Protein Array (RPPA) data contains intra-batch normalized intensities.,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID,  HTAN Parent Data File ID, HTAN Data File ID, Assay Type, Software and Version, Normalization Method",,FALSE,Assay,Biospecimen,,
+RPPA Level 4,Level 4 Reverse Phase Protein Array (RPPA) data contains intra-batch corrected intensities.,,"Component, Filename, File Format, HTAN Participant ID, HTAN Parent Biospecimen ID, HTAN Parent Data File ID, HTAN Data File ID, Assay Type, Batch Correction Method",,FALSE,Assay,RPPA Level 2,,
 Mass Spectrometry Level 1,"Mass Spectrometry derived data that includes proteomics, metabolomics, and lipidomics, level 1",,"Component, Filename, File Format, HTAN Parent Biospecimen ID, HTAN Data File ID, MS Batch ID, MS-based Assay Type, Analyte Type, MS-based Targeted, Instrument Make and Model, MS Source, Polarity, Mass Range Low Value, Mass Range High Value, Data Collection Mode, MS Scan Mode, MS Labeling, Protocol Link, LC Instrument Vendor and Model, LC Column Vendor and Model, LC Resin, LC Length Value, LC Temp Value, LC ID Value, LC Flow Rate, LC Gradient, LC Mobile Phase A, LC Mobile Phase B, Software and Version, Protocol Link",,FALSE,Assay,Biospecimen,,
 Mass Spectrometry Level 2,"Mass Spectrometry derived data that includes proteomics, metabolomics, and lipidomics, level 2",,"Component, Filename, File Format, HTAN Data File ID, HTAN Parent Biospecimen ID, HTAN Parent Data File ID, MS Assay Category, Software and Version, Mass Spectrometry Auxiliary File",,FALSE,Assay,Mass Spectrometry Level 1,,
 Mass Spectrometry Level 3,"Mass Spectrometry derived data that includes proteomics, metabolomics, and lipidomics, level 3",,"Component, Filename, File Format, HTAN Data File ID, HTAN Parent Biospecimen ID, HTAN Parent Data File ID, MS Assay Category, Software and Version, Mass Spectrometry Auxiliary File",,FALSE,Assay,Mass Spectrometry Level 2,,