Merge pull request #79 from icgc-argo/update_metadata_dict

update dict and template
icgc-argo · Jun 9, 2023 · 2eae399 · 2eae399
2 parents 4c76703 + 8521955
commit 2eae399
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 34 deletions.
diff --git a/metadata_dictionary/experiment_dict.tsv b/metadata_dictionary/experiment_dict.tsv
@@ -1,23 +1,25 @@
 Field	Attribute	Description	Permissible Values	Note
 type	Required	table type	sequencing_experiment	
 submitter_sequencing_experiment_id	Required	Unique identifier of the sequencing experiment, assigned by the data provider.	String values that meet the regular expression ^[a-zA-Z0-9]{1}[a-zA-Z0-9\\-_\\.:']{0,98}[a-zA-Z0-9]{1}$	
-program_id	Required	ARGO Program ID, the unique identifier of your program. If you have logged into the ARGO Data Platform, this is the Program ID that you see in the Program Services area.		Must be the same as what is in sample_registration table
-submitter_donor_id	Required	Unique identifier of the donor, assigned by the data provider.	Values must meet the regular expression  ^[A-Za-z0-9\-\._]{1,64}	Must be the same as what is in sample_registration table
-submitter_specimen_id	Required	Unique identifier of the specimen, assigned by the data provider.	Values must meet the regular expression ^[A-Za-z0-9\-\._]{1,64}	Must be the same as what is in sample_registration table
-submitter_sample_id	Required	Unique identifier of the sample, assigned by the data provider. If submitted along with BAM molecular data, must also be present in header SM.	Values must meet the regular expression  ^[A-Za-z0-9\-\._]{1,64}	Must be the same as what is in sample_registration table
-submitter_matched_normal_sample_id	Required	Provide the identifier of matched normal sample used for data analysis.       	 Values must meet the regular expression ^[A-Za-z0-9\-\._]{1,64} or null	Required for WGS and WXS tumour samples
+program_id	Required	ARGO Program ID, the unique identifier of your program. If you have logged into the ARGO Data Platform, this is the Program ID that you see in the Program Services area.		Must be the same as in the sample_registration table submitted to the ARGO platform.
+submitter_donor_id	Required	Unique identifier of the donor, assigned by the data provider.	Values must meet the regular expression ^[A-Za-z0-9\-\._]{1,64}	Must be the same as in the sample_registration table submitted to the ARGO platform.
+submitter_specimen_id	Required	Unique identifier of the specimen, assigned by the data provider.	Values must meet the regular expression ^[A-Za-z0-9\-\._]{1,64}	Must be the same as in the sample_registration table submitted to the ARGO platform.
+submitter_sample_id	Required	Unique identifier of the sample, assigned by the data provider. If submitted along with BAM molecular data, must also be present in header SM.	Values must meet the regular expression ^[A-Za-z0-9\-\._]{1,64}	Must be the same as in the sample_registration table submitted to the ARGO platform.
+submitter_matched_normal_sample_id	Conditional Required	Provide the identifier of matched normal sample used for data analysis.	Values must meet the regular expression ^[A-Za-z0-9\-\._]{1,64} or empty(null)	Required for WGS/WXS tumour samples
 read_group_count	Required	The number of read groups in the molecular files being submitted.	A minimum of 1 is required.	
-platform	Required	The sequencing platform type used in data generation. Can also be specified within Bam header PL.	CAPILLARY, LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT, ONT, PACBIO, Nanopore, BGI	
+platform	Required	The sequencing platform type used in data generation.	CAPILLARY, LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT, ONT, PACBIO, Nanopore, BGI	
 experimental_strategy	Required	The primary experimental method. For sequencing data it refers to how the sequencing library was made.	WGS, WXS, RNA-Seq, Bisulfite-Seq, ChIP-Seq, Targeted-Seq	
-sequencing_date	Required	Date sequencing was performed.	datetime format, for example: 2019-06-16 or 2019-06-16T20:20:39+00:00 or null	
-platform_model	Required	The model number of the sequencing machine used in data generation. Can also be specified within Bam header PM.	Any string value or null	
-sequencing_center	Required	Data centre sequencing was performed. Can also be specified with Bam header CN.	Any string value or null	
-target_capture_kit	Optional	Description that can uniquely identify a target capture kit. 	xGen Exome Research Panel V1 (IDT), SeqCap EZ MedExome (Roche), SureSelect Human All Exon V6 (Agilent), Human Core Exome Kit + RefSeq V1 (Twist) null	
-library_isolation_protocol	Optional	Provide the protocol used to isolate RNAs	TRIzol Reagent (Thermo Fisher), RNeasy kits (QIAGEN), RNase free DNase I (Thermo Fisher), Pico Pure RNA isolation kit (Thermo Fisher), mirVANA microRNA isolation kit (Thermo Fisher), Absolutely Total RNA, miRNA & mRNA Purification Kits (Stratagene, Agilent technologies), SV total RNA isolation kit (Promega), RNAqueous Kit (Thermo Fisher), AllPrep DNA/RNA Micro Kit (QIAGEN), GenElute Mammalian Total RNA Miniprep kit (MilliporeSigma), Spectrum Plant Total RNA kit (MilliporeSigma), peqGOLD Total RNA kits (PeqLab Biotechnologie), RNAlater (Thermo Fisher) null	
-library_preparation_kit	Optional	Provide the kit being used for library construction	Ovation SoLo kit (NuGEN), SMARTer Stranded Total RNA-Seq Kit (Takara), TruSeq RNA sample preparation v2 (Illumina), SMART-Seq v4 Ultra Low Input RNA Kit (Takara), Nextera XT DNA Library Preparation Kit (Illumina), NEXTflex kit (Bioo Scientific) null	
-library_strandedness	Conditional Required	Indicate the data strandedness	UNSTRANDED, FIRST_READ_SENSE_STRAND, FIRST_READ_ANTISENSE_STRAND null	Required for RNA-Seq
-rin	Optional	RNA integrity number	A number between 1 to 10 or null	
-dv200	Optional	The percentage of RNA fragments that are >200 nucleotides in size	A percentage or null	
-spike_ins_included	Optional	Indicate if include spike ins?	true, false	
-spike_ins_fasta	Optional	Name of FASTA file that contains the spike-in sequences	Any string value or null. Must match a fileName identified in the files section.	
-spike_ins_concentration	Optional	Spike in concentration	String or null	
+sequencing_date	Optional	The date of sequencing	datetime format, for example: 2019-06-16 or 2019-06-16T20:20:39+00:00 or empty(null)	
+platform_model	Optional	The model number of the sequencing machine used in data generation.	Any string value or empty(null)	
+sequencing_center	Optional	Data centre where sequencing was performed. Can also be specified with BAM header CN.	Any string value or empty(null)	
+target_capture_kit	Conditional Required	Description that can uniquely identify a target capture kit. Suggested value is a combination of vendor, kit name, and kit version.	Any string value or empty(null)	Required for Targeted-Seq/WXS
+primary_target_regions	Conditional Required	A BED file which holds the biologically relevant target regions (based on a genome, e.g. GRCh38) to capture by the assay.	Customized Enum values which can be mapped to fileName and fileURL	Required for Targeted-Seq/WXS
+capture_target_regions	Conditional Required	A BED file which holds the technically relevant probes region to capture by the assay.	Customized Enum values which can be mapped to fileName and fileURL	Required for Targeted-Seq/WXS
+number_of_genes	Optional	Number of genes the assay is targeting	Integer with a minimum value of 1 or empty(null).	Optional for Targeted-Seq
+gene_padding	Optional	Number of basepairs to add to exon endpoints for the inBED filter	Integer with a minimum value of 0 or empty(null).	Optional for Targeted-Seq
+coverage	Optional	List of coverage	Hotspot Regions, Coding Exons, Introns, Promoters, or empty(null)	Optional for Targeted-Seq
+library_selection	Optional	The method used to select and/or enrich the material being sequenced.	Affinity Enrichment, Hybrid Selection, miRNA Size Fractionation, PCR-based Enrichment, Poly-T Enrichment, Random, rRNA Depletion, Molecular Inversion Probes, or empty(null)	Optional for Targeted-Seq/WXS/RNA-Seq
+library_preparation_kit	Optional	Provide the kit information being used for library construction. Suggested value is a combination of vendor, kit name, and kit version.	Any string value or empty(null)	
+library_strandedness	Conditional Required	Indicate the library strandedness	UNSTRANDED, FIRST_READ_SENSE_STRAND, FIRST_READ_ANTISENSE_STRAND, or empty(null)	Required for RNA-Seq
+rin	Optional	A numerical assessment of the integrity of RNA based on the entire electrophoretic trace of the RNA sample including the presence or absence of degradation products.	A number between 1 and 10 or empty(null)	Optional for RNA-Seq
+dv200	Optional	The percentage of RNA fragments that are >200 nucleotides in size	A percentage or empty(null)	Optional for RNA-Seq
diff --git a/metadata_dictionary/files_dict.tsv b/metadata_dictionary/files_dict.tsv
@@ -1,10 +1,10 @@
 Field	Attribute	Description	Permissible Values	Note
 type	Required	table type	sequencing_file	
 name	Required	Name of the file.	String values must meet the regular expression ^[A-Za-z0-9_\\.\\-\\[\\]\\(\\)]+$. No paths are allowed in the file name.	
-format	Required	Data format of sequencing files.	BAM, FASTQ	
+format	Required	Data format of sequencing files.	BAM, FASTQ, CRAM	
 size	Required	Size of the file, in bytes.		
-md5sum	Required	Computed md5sum of the file. 	String values must meet the regular expression ^[a-fA-F0-9]{32}$	
-path	Required	The path to the file to be submitted		Required for local data and data downloaded from EGA through Aspera
+md5sum	Required	Computed md5sum of the file.	String values must meet the regular expression ^[a-fA-F0-9]{32}$	
+path	Conditional Required	The path to the file to be submitted		Required for local data (use the file path relative to the directory from which you run the workflow) and data downloaded from EGA through Aspera (use the file path relative to the Aspera root directory)
 ega_file_id	Conditional Required	EGA File Unique Accession ID	^EGAF[0-9]{1,32}$	Required for data downloaded from EGA
 ega_dataset_id	Optional	EGA Dataset Accession ID	^EGAD[0-9]{1,32}$	
 ega_experiment_id	Optional	EGA Experiment ID	^EGAX[0-9]{1,32}$	

diff --git a/metadata_dictionary/read_groups_dict.tsv b/metadata_dictionary/read_groups_dict.tsv
@@ -2,13 +2,13 @@ Field	Attribute	Description	Permissible Values	Note
 type	Required	type of the table	read_group	
 submitter_sequencing_experiment_id	Required	Unique identifier of the sequencing experiment, assigned by the data provider.	String values that meet the regular expression ^[a-zA-Z0-9]{1}[a-zA-Z0-9\\-_\\.:']{0,98}[a-zA-Z0-9]{1}$	
 submitter_read_group_id	Required	The identifier of a read group. Must be unique within each sample. After submission, the submitter_read_group_id in the metadata will be used for all future @RG ID in the header.	String values that meet the regular expression ^[a-zA-Z0-9\\-_:\\.']+$.	
-read_group_id_in_bam	Required	Optional field indicating the @RD ID in the BAM header and RG:Z in BAM body. If submitted, this will be used to map the @RG ID in the BAM header to the submitter_read_group_id in the payload. This cannot be submitted for FASTQ files.	String value must meet the regular expression ^[a-zA-Z0-9\\-_:\\.']+$ or null.	
-platform_unit	Required	Unique identifier including the {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE}. The {FLOWCELL_BARCODE} refers to the unique identifier for a particular flow cell. The {LANE} indicates the lane of the flow cell and the {SAMPLE_BARCODE} is a sample/library-specific identifier. For non-multiplex sequencing, platform unit and read group have a one-to-one relationship.	Any string value.	
+read_group_id_in_bam	Conditional Required	Conditional required field indicating the @RG ID in the BAM header and RG:Z in BAM body. If submitted, this will be used to map the @RG ID in the BAM header to the submitter_read_group_id in the payload.	String value must meet the regular expression ^[a-zA-Z0-9\\-_:\\.']+$ or null.	Required for BAM/CRAM files. This can NOT be submitted for FASTQ files.
+platform_unit	Required	Unique identifier indicating a particular flow cell, lane, and sample/library-specific identifier. For non-multiplex sequencing, platform unit and read group have a one-to-one relationship.	Any string value.	
 is_paired_end	Required	Indicate if paired-end sequencing was performed.	true, false	
-file_r1	Required	Name of the sequencing file containing reads from the first end of a sequencing run.	Any string value. Must match a fileName identified in the files section.	
-file_r2	Required	Name of the sequencing file containing reads from the second end of a paired-end sequencing run. Required if and only if paired-end sequencing was done.	Any string value or null. Must match a fileName identified in the files section.	
-read_length_r1	Required	Length of sequencing reads in file_r1; this corresponds to the number of sequencing cycles of the first end.	Integer with a minimum value of 20 or empty(null).	
-read_length_r2	Required	Length of sequencing reads in file_r2; this corresponds to the number of sequencing cycles of the second end.	Integer with a minimum value of 20 or empty(null).	
-insert_size	Required	For paired-end sequencing, the average size of sequences between two sequencing ends. Required only for paired-end sequencing.	Integer with a minimum value of 0 or empty(null).	
-sample_barcode	Required	According to the SAM specification, this is the expected barcode bases as read by the sequencing machine in the absence of errors.	Any string value or empty(null). 	
-library_name	Required	Name of a sequencing library made from a molecular sample or a sample pool (multiplex sequencing). Must also be specified within Bam header LB.	Any string value.	
+file_r1	Required	Name of the sequencing file containing reads from the first end of a sequencing run.	Any string value.	Must match a fileName identified in the files section.
+file_r2	Conditional Required	Name of the sequencing file containing reads from the second end of a paired-end sequencing run. Required if and only if paired-end sequencing was done.	Any string value or null.	Must match a fileName identified in the files section.
+library_name	Required	Name of a sequencing library.	Any string value.	
+read_length_r1	Optional	Length of sequencing reads in file_r1; this corresponds to the number of sequencing cycles of the first end.	Integer with a minimum value of 20 or empty(null).	
+read_length_r2	Optional	Length of sequencing reads in file_r2; this corresponds to the number of sequencing cycles of the second end.	Integer with a minimum value of 20 or empty(null).	
+insert_size	Optional	For paired-end sequencing, the average size of sequences between two sequencing ends. Applicable only to paired-end sequencing.	Integer with a minimum value of 0 or empty(null).	
+sample_barcode	Optional	According to the SAM specification, this is the expected barcode bases as read by the sequencing machine in the absence of errors.	Any string value or empty(null).	
diff --git a/metadata_dictionary/template/experiment.tsv b/metadata_dictionary/template/experiment.tsv
@@ -1,2 +1,2 @@
-type	submitter_sequencing_experiment_id	program_id	submitter_donor_id	submitter_specimen_id	submitter_sample_id	submitter_matched_normal_sample_id	read_group_count	platform	experimental_strategy	sequencing_date	platform_model	sequencing_center	target_capture_kit	library_isolation_protocol	library_preparation_kit	library_strandedness	rin	dv200	spike_ins_included	spike_ins_fasta	spike_ins_concentration
-sequencing_experiment																					
+type	submitter_sequencing_experiment_id	program_id	submitter_donor_id	submitter_specimen_id	submitter_sample_id	submitter_matched_normal_sample_id	read_group_count	platform	experimental_strategy	sequencing_date	platform_model	sequencing_center	target_capture_kit	primary_target_regions	capture_target_regions	number_of_genes	gene_padding	coverage	library_selection	library_preparation_kit	library_strandedness	rin	dv200
+sequencing_experiment																							
diff --git a/metadata_dictionary/template/files.tsv b/metadata_dictionary/template/files.tsv
@@ -1,2 +1,2 @@
 type	name	format	size	md5sum	path	ega_file_id	ega_dataset_id	ega_experiment_id	ega_sample_id	ega_study_id	ega_run_id	ega_policy_id	ega_analysis_id	ega_submission_id	ega_dac_id
-file															
+sequencing_file															
diff --git a/metadata_dictionary/template/read_groups.tsv b/metadata_dictionary/template/read_groups.tsv
@@ -1,2 +1,2 @@
-type	submitter_sequencing_experiment_id	submitter_read_group_id	read_group_id_in_bam	platform_unit	is_paired_end	file_r1	file_r2	read_length_r1	read_length_r2	insert_size	sample_barcode	library_name
+type	submitter_sequencing_experiment_id	submitter_read_group_id	read_group_id_in_bam	platform_unit	is_paired_end	file_r1	file_r2	library_name	read_length_r1	read_length_r2	insert_size	sample_barcode
 read_group