From a82e2f139d09d853125cb0190d9b22e41446c425 Mon Sep 17 00:00:00 2001 From: Stephen Kelly Date: Tue, 25 Jan 2022 17:45:57 -0500 Subject: [PATCH 1/4] refactor samples fillout workflow to use array of sample records --- cwl/samples_fillout_workflow.cwl | 74 ++++++++++++++++------ tests/test_samples_fillout_workflow_cwl.py | 33 ++++++---- 2 files changed, 77 insertions(+), 30 deletions(-) diff --git a/cwl/samples_fillout_workflow.cwl b/cwl/samples_fillout_workflow.cwl index a4304c1..7793d8b 100644 --- a/cwl/samples_fillout_workflow.cwl +++ b/cwl/samples_fillout_workflow.cwl @@ -13,24 +13,24 @@ requirements: inputs: # NOTE: arrays for sample_ids, bam_files, maf_files must all be the same length and in the same order by sample - + samples: + type: + type: array + items: + type: record + fields: + maf_file: File + sample_id: string # must match sample ID used inside maf file + normal_id: string output_fname: type: [ 'null', string ] default: "output.maf" - sample_ids: - type: - type: array - items: string bam_files: type: type: array items: File secondaryFiles: - ^.bai - maf_files: - type: - type: array - items: File ref_fasta: type: File secondaryFiles: @@ -54,14 +54,42 @@ inputs: # path: /juno/work/ci/resources/vep/cache steps: + # create a list of just sample_ids out of the samples record array + create_samples_list: + in: + samples: samples + out: [ sample_ids ] + run: + class: ExpressionTool + inputs: + samples: + type: + type: array + items: + type: record + fields: + sample_id: string + outputs: + sample_ids: string[] + expression: "${ + var sample_ids = []; + for ( var i in inputs.samples ){ + sample_ids.push(inputs.samples[i]['sample_id']); + }; + return {'sample_ids': sample_ids}; + }" + + # convert all maf input files back to vcf because they are much easier to manipulate that way # NOTE: This is important; do NOT try to do these complex manipulations on maf format file maf2vcf: - scatter: [ sample_id, maf_file ] - scatterMethod: dotproduct + scatter: sample in: - sample_id: sample_ids - maf_file: maf_files + sample: samples + sample_id: + valueFrom: ${ return inputs.sample['sample_id']; } + maf_file: + valueFrom: ${ return inputs.sample['maf_file']; } ref_fasta: ref_fasta out: [ output_file ] @@ -116,7 +144,7 @@ steps: # this will be used as the target regions for fillout merge_vcfs: in: - sample_ids: sample_ids + sample_ids: create_samples_list/sample_ids vcf_gz_files: maf2vcf/output_file out: [ merged_vcf, merged_vcf_gz ] @@ -163,9 +191,11 @@ steps: - .tbi # run GetBaseCountsMultiSample on all the bam files against the target regions (the merged vcf from all samples) + # TODO: convert this to a `scatter` step that runs per-sample in parallel, then merge the outputs + # otherwise we will hit the command line arg length issues gbcms: in: - sample_ids: sample_ids + sample_ids: create_samples_list/sample_ids bam_files: bam_files targets_vcf: merge_vcfs/merged_vcf ref_fasta: ref_fasta @@ -227,7 +257,6 @@ steps: # also we are going to add a column called SRC telling the source (which sample) each variant was originally found in fix_labels_and_merge_vcfs: in: - sample_ids: sample_ids fillout_vcf: gbcms/output_file merged_vcf: merge_vcfs/merged_vcf merged_vcf_gz: merge_vcfs/merged_vcf_gz @@ -316,9 +345,13 @@ steps: # next we need to split apart the merged fillout vcf back into individual sample maf files split_vcf_to_mafs: - scatter: [ sample_id ] + scatter: sample in: - sample_id: sample_ids + sample: samples + sample_id: + valueFrom: ${ return inputs.sample['sample_id']; } + normal_id: + valueFrom: ${ return inputs.sample['normal_id']; } fillout_vcf: fix_labels_and_merge_vcfs/fillout_sources_vcf ref_fasta: ref_fasta exac_filter: exac_filter @@ -338,6 +371,7 @@ steps: set -eu # convert the multi-sample annotated fillout vcf back into individual sample maf files sample_id="${ return inputs.sample_id ; }" + normal_id="${ return inputs.normal_id ; }" ref_fasta="${ return inputs.ref_fasta.path ; }" input_vcf="${ return inputs.fillout_vcf.path ; }" exac_filter="${ return inputs.exac_filter.path ; }" @@ -364,9 +398,11 @@ steps: --retain-fmt GT,FL_AD,FL_ADN,FL_ADP,FL_DP,FL_DPN,FL_DPP,FL_RD,FL_RDN,FL_RDP,FL_VF,AD,DP \\ --vep-forks 8 \\ --vcf-tumor-id "\${sample_id}" \\ - --tumor-id "\${sample_id}" + --tumor-id "\${sample_id}" \\ + --normal-id "\${normal_id}" inputs: sample_id: string + normal_id: string ref_fasta: type: File secondaryFiles: diff --git a/tests/test_samples_fillout_workflow_cwl.py b/tests/test_samples_fillout_workflow_cwl.py index 467ab3e..8475f0a 100644 --- a/tests/test_samples_fillout_workflow_cwl.py +++ b/tests/test_samples_fillout_workflow_cwl.py @@ -117,17 +117,28 @@ def test_run_fillout_workflow(self): Test case for running the fillout workflow on a number of samples, each with a bam and maf """ self.maxDiff = None + # self.runner_args['debug'] = True + # self.runner_args['js_console'] = True + # self.preserve = True + # print(self.tmpdir) self.input = { + "samples": [ + { + "sample_id": "Sample24", + "normal_id": "Sample24-N", + "maf_file": { "class": "File", "path": self.maf1 } + }, + { + "sample_id": "Sample23", + "normal_id": "Sample23-N", + "maf_file": { "class": "File", "path": self.maf2 } + }, + ], "ref_fasta": {"class": "File", "path": self.DATA_SETS['Proj_08390_G']['REF_FASTA']}, - "sample_ids": ["Sample24", "Sample23"], "bam_files": [ { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_08390_G']['BAM_DIR'], "Sample24.rg.md.abra.printreads.bam") }, { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_08390_G']['BAM_DIR'], "Sample23.rg.md.abra.printreads.bam") } - ], - "maf_files": [ - { "class": "File", "path": self.maf1 }, - { "class": "File", "path": self.maf2 } ] } @@ -138,8 +149,8 @@ def test_run_fillout_workflow(self): 'location': 'file://' + os.path.join(output_dir,'output.maf'), 'basename': 'output.maf', 'class': 'File', - 'checksum': 'sha1$45719b33d5de77f789b43a2f66f1bf0a5d039cbd', - 'size': 7992, + 'checksum': 'sha1$7932ae9938a5686f6328f143a6c82308877cb822', + 'size': 8008, 'path': os.path.join(output_dir,'output.maf') } } @@ -154,13 +165,13 @@ def test_run_fillout_workflow(self): self.assertTrue(len(records) == 4) expected_records = [ - {'Hugo_Symbol': 'KMT2C', 'Entrez_Gene_Id': '58508', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '7', 'Start_Position': '151845367', 'End_Position': '151845367', 'Strand': '+', 'Variant_Classification': 'Missense_Mutation', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': '', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample24', 'Matched_Norm_Sample_Barcode': 'NORMAL', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.13645C>T', 'HGVSp': 'p.Arg4549Cys', 'HGVSp_Short': 'p.R4549C', 'Transcript_ID': 'ENST00000262189', 'Exon_Number': '52/59', 't_depth': '72', 't_ref_count': '68', 't_alt_count': '4', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'KMT2C,missense_variant,p.Arg4606Cys,ENST00000355193,;KMT2C,missense_variant,p.Arg4549Cys,ENST00000262189,NM_170606.2;KMT2C,missense_variant,p.Arg2110Cys,ENST00000360104,;KMT2C,missense_variant,p.Arg1166Cys,ENST00000424877,;KMT2C,downstream_gene_variant,,ENST00000418061,;KMT2C,downstream_gene_variant,,ENST00000485241,;KMT2C,3_prime_UTR_variant,,ENST00000558084,;KMT2C,non_coding_transcript_exon_variant,,ENST00000473186,;', 'Allele': 'A', 'Gene': 'ENSG00000055609', 'Feature': 'ENST00000262189', 'Feature_type': 'Transcript', 'Consequence': 'missense_variant', 'cDNA_position': '13864/16862', 'CDS_position': '13645/14736', 'Protein_position': '4549/4911', 'Amino_acids': 'R/C', 'Codons': 'Cgc/Tgc', 'Existing_variation': 'COSM245709,COSM245710', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '-1', 'SYMBOL': 'KMT2C', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '13726', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS5931.1', 'ENSP': 'ENSP00000262189', 'SWISSPROT': 'Q8NEZ4', 'TREMBL': 'Q6N019,Q75MN6,H0YMU7', 'UNIPARC': 'UPI0000141B9F', 'RefSeq': 'NM_170606.2', 'SIFT': '', 'PolyPhen': 'probably_damaging(0.999)', 'EXON': '52/59', 'INTRON': '', 'DOMAINS': 'PROSITE_profiles:PS51542,hmmpanther:PTHR22884,hmmpanther:PTHR22884:SF305', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '1,1', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'MODERATE', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '1,1', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '', 'FILTER': '.', 'flanking_bps': 'CGA', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '151845367', 'AC': '1', 'AN': '2', 'SRC': 'Sample23,', 't_GT': './.', 'n_GT': '', 't_FL_AD': '4', 'n_FL_AD': '', 't_FL_ADN': '2', 'n_FL_ADN': '', 't_FL_ADP': '2', 'n_FL_ADP': '', 't_FL_DP': '72', 'n_FL_DP': '', 't_FL_DPN': '38', 'n_FL_DPN': '', 't_FL_DPP': '34', 'n_FL_DPP': '', 't_FL_RD': '68', 'n_FL_RD': '', 't_FL_RDN': '36', 'n_FL_RDN': '', 't_FL_RDP': '32', 'n_FL_RDP': '', 't_FL_VF': '0.0555556', 'n_FL_VF': '', 't_AD': '', 'n_AD': '', 't_DP': '.', 'n_DP': '', 't_depth_sample': '', 't_ref_count_sample': '', 't_alt_count_sample': '', 'is_fillout': 'True'}, + {'Hugo_Symbol': 'KMT2C', 'Entrez_Gene_Id': '58508', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '7', 'Start_Position': '151845367', 'End_Position': '151845367', 'Strand': '+', 'Variant_Classification': 'Missense_Mutation', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': '', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample24', 'Matched_Norm_Sample_Barcode': 'Sample24-N', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.13645C>T', 'HGVSp': 'p.Arg4549Cys', 'HGVSp_Short': 'p.R4549C', 'Transcript_ID': 'ENST00000262189', 'Exon_Number': '52/59', 't_depth': '72', 't_ref_count': '68', 't_alt_count': '4', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'KMT2C,missense_variant,p.Arg4606Cys,ENST00000355193,;KMT2C,missense_variant,p.Arg4549Cys,ENST00000262189,NM_170606.2;KMT2C,missense_variant,p.Arg2110Cys,ENST00000360104,;KMT2C,missense_variant,p.Arg1166Cys,ENST00000424877,;KMT2C,downstream_gene_variant,,ENST00000418061,;KMT2C,downstream_gene_variant,,ENST00000485241,;KMT2C,3_prime_UTR_variant,,ENST00000558084,;KMT2C,non_coding_transcript_exon_variant,,ENST00000473186,;', 'Allele': 'A', 'Gene': 'ENSG00000055609', 'Feature': 'ENST00000262189', 'Feature_type': 'Transcript', 'Consequence': 'missense_variant', 'cDNA_position': '13864/16862', 'CDS_position': '13645/14736', 'Protein_position': '4549/4911', 'Amino_acids': 'R/C', 'Codons': 'Cgc/Tgc', 'Existing_variation': 'COSM245709,COSM245710', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '-1', 'SYMBOL': 'KMT2C', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '13726', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS5931.1', 'ENSP': 'ENSP00000262189', 'SWISSPROT': 'Q8NEZ4', 'TREMBL': 'Q6N019,Q75MN6,H0YMU7', 'UNIPARC': 'UPI0000141B9F', 'RefSeq': 'NM_170606.2', 'SIFT': '', 'PolyPhen': 'probably_damaging(0.999)', 'EXON': '52/59', 'INTRON': '', 'DOMAINS': 'PROSITE_profiles:PS51542,hmmpanther:PTHR22884,hmmpanther:PTHR22884:SF305', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '1,1', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'MODERATE', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '1,1', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '', 'FILTER': '.', 'flanking_bps': 'CGA', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '151845367', 'AC': '1', 'AN': '2', 'SRC': 'Sample23,', 't_GT': './.', 'n_GT': '', 't_FL_AD': '4', 'n_FL_AD': '', 't_FL_ADN': '2', 'n_FL_ADN': '', 't_FL_ADP': '2', 'n_FL_ADP': '', 't_FL_DP': '72', 'n_FL_DP': '', 't_FL_DPN': '38', 'n_FL_DPN': '', 't_FL_DPP': '34', 'n_FL_DPP': '', 't_FL_RD': '68', 'n_FL_RD': '', 't_FL_RDN': '36', 'n_FL_RDN': '', 't_FL_RDP': '32', 'n_FL_RDP': '', 't_FL_VF': '0.0555556', 'n_FL_VF': '', 't_AD': '', 'n_AD': '', 't_DP': '.', 'n_DP': '', 't_depth_sample': '', 't_ref_count_sample': '', 't_alt_count_sample': '', 'is_fillout': 'True'}, - {'Hugo_Symbol': 'RTEL1', 'Entrez_Gene_Id': '51750', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '20', 'Start_Position': '62321135', 'End_Position': '62321135', 'Strand': '+', 'Variant_Classification': 'Silent', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': 'rs746824222', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample24', 'Matched_Norm_Sample_Barcode': 'NORMAL', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.2130G>A', 'HGVSp': 'p.=', 'HGVSp_Short': 'p.Q710=', 'Transcript_ID': 'ENST00000508582', 'Exon_Number': '24/35', 't_depth': '653', 't_ref_count': '511', 't_alt_count': '142', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'RTEL1,synonymous_variant,p.=,ENST00000318100,;RTEL1,synonymous_variant,p.=,ENST00000370018,NM_032957.4,NM_016434.3;RTEL1,synonymous_variant,p.=,ENST00000360203,NM_001283009.1;RTEL1,synonymous_variant,p.=,ENST00000508582,;RTEL1,synonymous_variant,p.=,ENST00000425905,;RTEL1,upstream_gene_variant,,ENST00000370003,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000482936,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000492259,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000480273,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000496281,;RTEL1,upstream_gene_variant,,ENST00000496816,;', 'Allele': 'A', 'Gene': 'ENSG00000258366', 'Feature': 'ENST00000508582', 'Feature_type': 'Transcript', 'Consequence': 'synonymous_variant', 'cDNA_position': '2476/4273', 'CDS_position': '2130/3732', 'Protein_position': '710/1243', 'Amino_acids': 'Q', 'Codons': 'caG/caA', 'Existing_variation': 'rs746824222', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '1', 'SYMBOL': 'RTEL1', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '15888', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS13530.3', 'ENSP': 'ENSP00000424307', 'SWISSPROT': 'Q9NZ71', 'TREMBL': '', 'UNIPARC': 'UPI00019B2219', 'RefSeq': '', 'SIFT': '', 'PolyPhen': '', 'EXON': '24/35', 'INTRON': '', 'DOMAINS': 'Superfamily_domains:SSF52540,SMART_domains:SM00491,Pfam_domain:PF13307,TIGRFAM_domain:TIGR00604,hmmpanther:PTHR11472:SF4,hmmpanther:PTHR11472', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'LOW', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '1', 'FILTER': '.', 'flanking_bps': 'AGG', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '62321135', 'AC': '1', 'AN': '2', 'SRC': 'Sample24,', 't_GT': '0/1', 'n_GT': '', 't_FL_AD': '0', 'n_FL_AD': '', 't_FL_ADN': '0', 'n_FL_ADN': '', 't_FL_ADP': '0', 'n_FL_ADP': '', 't_FL_DP': '129', 'n_FL_DP': '', 't_FL_DPN': '63', 'n_FL_DPN': '', 't_FL_DPP': '66', 'n_FL_DPP': '', 't_FL_RD': '129', 'n_FL_RD': '', 't_FL_RDN': '63', 'n_FL_RDN': '', 't_FL_RDP': '66', 'n_FL_RDP': '', 't_FL_VF': '0', 'n_FL_VF': '', 't_AD': '511,142', 'n_AD': '', 't_DP': '653', 'n_DP': '', 't_depth_sample': '653', 't_ref_count_sample': '511', 't_alt_count_sample': '142', 'is_fillout': 'False'}, + {'Hugo_Symbol': 'RTEL1', 'Entrez_Gene_Id': '51750', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '20', 'Start_Position': '62321135', 'End_Position': '62321135', 'Strand': '+', 'Variant_Classification': 'Silent', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': 'rs746824222', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample24', 'Matched_Norm_Sample_Barcode': 'Sample24-N', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.2130G>A', 'HGVSp': 'p.=', 'HGVSp_Short': 'p.Q710=', 'Transcript_ID': 'ENST00000508582', 'Exon_Number': '24/35', 't_depth': '653', 't_ref_count': '511', 't_alt_count': '142', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'RTEL1,synonymous_variant,p.=,ENST00000318100,;RTEL1,synonymous_variant,p.=,ENST00000370018,NM_032957.4,NM_016434.3;RTEL1,synonymous_variant,p.=,ENST00000360203,NM_001283009.1;RTEL1,synonymous_variant,p.=,ENST00000508582,;RTEL1,synonymous_variant,p.=,ENST00000425905,;RTEL1,upstream_gene_variant,,ENST00000370003,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000482936,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000492259,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000480273,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000496281,;RTEL1,upstream_gene_variant,,ENST00000496816,;', 'Allele': 'A', 'Gene': 'ENSG00000258366', 'Feature': 'ENST00000508582', 'Feature_type': 'Transcript', 'Consequence': 'synonymous_variant', 'cDNA_position': '2476/4273', 'CDS_position': '2130/3732', 'Protein_position': '710/1243', 'Amino_acids': 'Q', 'Codons': 'caG/caA', 'Existing_variation': 'rs746824222', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '1', 'SYMBOL': 'RTEL1', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '15888', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS13530.3', 'ENSP': 'ENSP00000424307', 'SWISSPROT': 'Q9NZ71', 'TREMBL': '', 'UNIPARC': 'UPI00019B2219', 'RefSeq': '', 'SIFT': '', 'PolyPhen': '', 'EXON': '24/35', 'INTRON': '', 'DOMAINS': 'Superfamily_domains:SSF52540,SMART_domains:SM00491,Pfam_domain:PF13307,TIGRFAM_domain:TIGR00604,hmmpanther:PTHR11472:SF4,hmmpanther:PTHR11472', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'LOW', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '1', 'FILTER': '.', 'flanking_bps': 'AGG', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '62321135', 'AC': '1', 'AN': '2', 'SRC': 'Sample24,', 't_GT': '0/1', 'n_GT': '', 't_FL_AD': '0', 'n_FL_AD': '', 't_FL_ADN': '0', 'n_FL_ADN': '', 't_FL_ADP': '0', 'n_FL_ADP': '', 't_FL_DP': '129', 'n_FL_DP': '', 't_FL_DPN': '63', 'n_FL_DPN': '', 't_FL_DPP': '66', 'n_FL_DPP': '', 't_FL_RD': '129', 'n_FL_RD': '', 't_FL_RDN': '63', 'n_FL_RDN': '', 't_FL_RDP': '66', 'n_FL_RDP': '', 't_FL_VF': '0', 'n_FL_VF': '', 't_AD': '511,142', 'n_AD': '', 't_DP': '653', 'n_DP': '', 't_depth_sample': '653', 't_ref_count_sample': '511', 't_alt_count_sample': '142', 'is_fillout': 'False'}, - {'Hugo_Symbol': 'KMT2C', 'Entrez_Gene_Id': '58508', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '7', 'Start_Position': '151845367', 'End_Position': '151845367', 'Strand': '+', 'Variant_Classification': 'Missense_Mutation', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': '', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample23', 'Matched_Norm_Sample_Barcode': 'NORMAL', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.13645C>T', 'HGVSp': 'p.Arg4549Cys', 'HGVSp_Short': 'p.R4549C', 'Transcript_ID': 'ENST00000262189', 'Exon_Number': '52/59', 't_depth': '653', 't_ref_count': '511', 't_alt_count': '142', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'KMT2C,missense_variant,p.Arg4606Cys,ENST00000355193,;KMT2C,missense_variant,p.Arg4549Cys,ENST00000262189,NM_170606.2;KMT2C,missense_variant,p.Arg2110Cys,ENST00000360104,;KMT2C,missense_variant,p.Arg1166Cys,ENST00000424877,;KMT2C,downstream_gene_variant,,ENST00000418061,;KMT2C,downstream_gene_variant,,ENST00000485241,;KMT2C,3_prime_UTR_variant,,ENST00000558084,;KMT2C,non_coding_transcript_exon_variant,,ENST00000473186,;', 'Allele': 'A', 'Gene': 'ENSG00000055609', 'Feature': 'ENST00000262189', 'Feature_type': 'Transcript', 'Consequence': 'missense_variant', 'cDNA_position': '13864/16862', 'CDS_position': '13645/14736', 'Protein_position': '4549/4911', 'Amino_acids': 'R/C', 'Codons': 'Cgc/Tgc', 'Existing_variation': 'COSM245709,COSM245710', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '-1', 'SYMBOL': 'KMT2C', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '13726', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS5931.1', 'ENSP': 'ENSP00000262189', 'SWISSPROT': 'Q8NEZ4', 'TREMBL': 'Q6N019,Q75MN6,H0YMU7', 'UNIPARC': 'UPI0000141B9F', 'RefSeq': 'NM_170606.2', 'SIFT': '', 'PolyPhen': 'probably_damaging(0.999)', 'EXON': '52/59', 'INTRON': '', 'DOMAINS': 'PROSITE_profiles:PS51542,hmmpanther:PTHR22884,hmmpanther:PTHR22884:SF305', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '1,1', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'MODERATE', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '1,1', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '', 'FILTER': '.', 'flanking_bps': 'CGA', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '151845367', 'AC': '1', 'AN': '2', 'SRC': 'Sample23,', 't_GT': '0/1', 'n_GT': '', 't_FL_AD': '0', 'n_FL_AD': '', 't_FL_ADN': '0', 'n_FL_ADN': '', 't_FL_ADP': '0', 'n_FL_ADP': '', 't_FL_DP': '91', 'n_FL_DP': '', 't_FL_DPN': '48', 'n_FL_DPN': '', 't_FL_DPP': '43', 'n_FL_DPP': '', 't_FL_RD': '91', 'n_FL_RD': '', 't_FL_RDN': '48', 'n_FL_RDN': '', 't_FL_RDP': '43', 'n_FL_RDP': '', 't_FL_VF': '0', 'n_FL_VF': '', 't_AD': '511,142', 'n_AD': '', 't_DP': '653', 'n_DP': '', 't_depth_sample': '653', 't_ref_count_sample': '511', 't_alt_count_sample': '142', 'is_fillout': 'False'}, + {'Hugo_Symbol': 'KMT2C', 'Entrez_Gene_Id': '58508', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '7', 'Start_Position': '151845367', 'End_Position': '151845367', 'Strand': '+', 'Variant_Classification': 'Missense_Mutation', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': '', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample23', 'Matched_Norm_Sample_Barcode': 'Sample23-N', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.13645C>T', 'HGVSp': 'p.Arg4549Cys', 'HGVSp_Short': 'p.R4549C', 'Transcript_ID': 'ENST00000262189', 'Exon_Number': '52/59', 't_depth': '653', 't_ref_count': '511', 't_alt_count': '142', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'KMT2C,missense_variant,p.Arg4606Cys,ENST00000355193,;KMT2C,missense_variant,p.Arg4549Cys,ENST00000262189,NM_170606.2;KMT2C,missense_variant,p.Arg2110Cys,ENST00000360104,;KMT2C,missense_variant,p.Arg1166Cys,ENST00000424877,;KMT2C,downstream_gene_variant,,ENST00000418061,;KMT2C,downstream_gene_variant,,ENST00000485241,;KMT2C,3_prime_UTR_variant,,ENST00000558084,;KMT2C,non_coding_transcript_exon_variant,,ENST00000473186,;', 'Allele': 'A', 'Gene': 'ENSG00000055609', 'Feature': 'ENST00000262189', 'Feature_type': 'Transcript', 'Consequence': 'missense_variant', 'cDNA_position': '13864/16862', 'CDS_position': '13645/14736', 'Protein_position': '4549/4911', 'Amino_acids': 'R/C', 'Codons': 'Cgc/Tgc', 'Existing_variation': 'COSM245709,COSM245710', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '-1', 'SYMBOL': 'KMT2C', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '13726', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS5931.1', 'ENSP': 'ENSP00000262189', 'SWISSPROT': 'Q8NEZ4', 'TREMBL': 'Q6N019,Q75MN6,H0YMU7', 'UNIPARC': 'UPI0000141B9F', 'RefSeq': 'NM_170606.2', 'SIFT': '', 'PolyPhen': 'probably_damaging(0.999)', 'EXON': '52/59', 'INTRON': '', 'DOMAINS': 'PROSITE_profiles:PS51542,hmmpanther:PTHR22884,hmmpanther:PTHR22884:SF305', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '1,1', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'MODERATE', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '1,1', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '', 'FILTER': '.', 'flanking_bps': 'CGA', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '151845367', 'AC': '1', 'AN': '2', 'SRC': 'Sample23,', 't_GT': '0/1', 'n_GT': '', 't_FL_AD': '0', 'n_FL_AD': '', 't_FL_ADN': '0', 'n_FL_ADN': '', 't_FL_ADP': '0', 'n_FL_ADP': '', 't_FL_DP': '91', 'n_FL_DP': '', 't_FL_DPN': '48', 'n_FL_DPN': '', 't_FL_DPP': '43', 'n_FL_DPP': '', 't_FL_RD': '91', 'n_FL_RD': '', 't_FL_RDN': '48', 'n_FL_RDN': '', 't_FL_RDP': '43', 'n_FL_RDP': '', 't_FL_VF': '0', 'n_FL_VF': '', 't_AD': '511,142', 'n_AD': '', 't_DP': '653', 'n_DP': '', 't_depth_sample': '653', 't_ref_count_sample': '511', 't_alt_count_sample': '142', 'is_fillout': 'False'}, - {'Hugo_Symbol': 'RTEL1', 'Entrez_Gene_Id': '51750', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '20', 'Start_Position': '62321135', 'End_Position': '62321135', 'Strand': '+', 'Variant_Classification': 'Silent', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': 'rs746824222', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample23', 'Matched_Norm_Sample_Barcode': 'NORMAL', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.2130G>A', 'HGVSp': 'p.=', 'HGVSp_Short': 'p.Q710=', 'Transcript_ID': 'ENST00000508582', 'Exon_Number': '24/35', 't_depth': '184', 't_ref_count': '184', 't_alt_count': '0', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'RTEL1,synonymous_variant,p.=,ENST00000318100,;RTEL1,synonymous_variant,p.=,ENST00000370018,NM_032957.4,NM_016434.3;RTEL1,synonymous_variant,p.=,ENST00000360203,NM_001283009.1;RTEL1,synonymous_variant,p.=,ENST00000508582,;RTEL1,synonymous_variant,p.=,ENST00000425905,;RTEL1,upstream_gene_variant,,ENST00000370003,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000482936,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000492259,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000480273,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000496281,;RTEL1,upstream_gene_variant,,ENST00000496816,;', 'Allele': 'A', 'Gene': 'ENSG00000258366', 'Feature': 'ENST00000508582', 'Feature_type': 'Transcript', 'Consequence': 'synonymous_variant', 'cDNA_position': '2476/4273', 'CDS_position': '2130/3732', 'Protein_position': '710/1243', 'Amino_acids': 'Q', 'Codons': 'caG/caA', 'Existing_variation': 'rs746824222', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '1', 'SYMBOL': 'RTEL1', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '15888', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS13530.3', 'ENSP': 'ENSP00000424307', 'SWISSPROT': 'Q9NZ71', 'TREMBL': '', 'UNIPARC': 'UPI00019B2219', 'RefSeq': '', 'SIFT': '', 'PolyPhen': '', 'EXON': '24/35', 'INTRON': '', 'DOMAINS': 'Superfamily_domains:SSF52540,SMART_domains:SM00491,Pfam_domain:PF13307,TIGRFAM_domain:TIGR00604,hmmpanther:PTHR11472:SF4,hmmpanther:PTHR11472', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'LOW', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '1', 'FILTER': '.', 'flanking_bps': 'AGG', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '62321135', 'AC': '1', 'AN': '2', 'SRC': 'Sample24,', 't_GT': './.', 'n_GT': '', 't_FL_AD': '0', 'n_FL_AD': '', 't_FL_ADN': '0', 'n_FL_ADN': '', 't_FL_ADP': '0', 'n_FL_ADP': '', 't_FL_DP': '184', 'n_FL_DP': '', 't_FL_DPN': '95', 'n_FL_DPN': '', 't_FL_DPP': '89', 'n_FL_DPP': '', 't_FL_RD': '184', 'n_FL_RD': '', 't_FL_RDN': '95', 'n_FL_RDN': '', 't_FL_RDP': '89', 'n_FL_RDP': '', 't_FL_VF': '0', 'n_FL_VF': '', 't_AD': '', 'n_AD': '', 't_DP': '.', 'n_DP': '', 't_depth_sample': '', 't_ref_count_sample': '', 't_alt_count_sample': '', 'is_fillout': 'True'} + {'Hugo_Symbol': 'RTEL1', 'Entrez_Gene_Id': '51750', 'Center': '.', 'NCBI_Build': 'GRCh37', 'Chromosome': '20', 'Start_Position': '62321135', 'End_Position': '62321135', 'Strand': '+', 'Variant_Classification': 'Silent', 'Variant_Type': 'SNP', 'Reference_Allele': 'G', 'Tumor_Seq_Allele1': 'G', 'Tumor_Seq_Allele2': 'A', 'dbSNP_RS': 'rs746824222', 'dbSNP_Val_Status': '', 'Tumor_Sample_Barcode': 'Sample23', 'Matched_Norm_Sample_Barcode': 'Sample23-N', 'Match_Norm_Seq_Allele1': 'G', 'Match_Norm_Seq_Allele2': 'G', 'Tumor_Validation_Allele1': '', 'Tumor_Validation_Allele2': '', 'Match_Norm_Validation_Allele1': '', 'Match_Norm_Validation_Allele2': '', 'Verification_Status': '', 'Validation_Status': '', 'Mutation_Status': '', 'Sequencing_Phase': '', 'Sequence_Source': '', 'Validation_Method': '', 'Score': '', 'BAM_File': '', 'Sequencer': '', 'Tumor_Sample_UUID': '', 'Matched_Norm_Sample_UUID': '', 'HGVSc': 'c.2130G>A', 'HGVSp': 'p.=', 'HGVSp_Short': 'p.Q710=', 'Transcript_ID': 'ENST00000508582', 'Exon_Number': '24/35', 't_depth': '184', 't_ref_count': '184', 't_alt_count': '0', 'n_depth': '', 'n_ref_count': '', 'n_alt_count': '', 'all_effects': 'RTEL1,synonymous_variant,p.=,ENST00000318100,;RTEL1,synonymous_variant,p.=,ENST00000370018,NM_032957.4,NM_016434.3;RTEL1,synonymous_variant,p.=,ENST00000360203,NM_001283009.1;RTEL1,synonymous_variant,p.=,ENST00000508582,;RTEL1,synonymous_variant,p.=,ENST00000425905,;RTEL1,upstream_gene_variant,,ENST00000370003,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000482936,;RTEL1-TNFRSF6B,synonymous_variant,p.=,ENST00000492259,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000480273,;RTEL1-TNFRSF6B,non_coding_transcript_exon_variant,,ENST00000496281,;RTEL1,upstream_gene_variant,,ENST00000496816,;', 'Allele': 'A', 'Gene': 'ENSG00000258366', 'Feature': 'ENST00000508582', 'Feature_type': 'Transcript', 'Consequence': 'synonymous_variant', 'cDNA_position': '2476/4273', 'CDS_position': '2130/3732', 'Protein_position': '710/1243', 'Amino_acids': 'Q', 'Codons': 'caG/caA', 'Existing_variation': 'rs746824222', 'ALLELE_NUM': '1', 'DISTANCE': '', 'STRAND_VEP': '1', 'SYMBOL': 'RTEL1', 'SYMBOL_SOURCE': 'HGNC', 'HGNC_ID': '15888', 'BIOTYPE': 'protein_coding', 'CANONICAL': 'YES', 'CCDS': 'CCDS13530.3', 'ENSP': 'ENSP00000424307', 'SWISSPROT': 'Q9NZ71', 'TREMBL': '', 'UNIPARC': 'UPI00019B2219', 'RefSeq': '', 'SIFT': '', 'PolyPhen': '', 'EXON': '24/35', 'INTRON': '', 'DOMAINS': 'Superfamily_domains:SSF52540,SMART_domains:SM00491,Pfam_domain:PF13307,TIGRFAM_domain:TIGR00604,hmmpanther:PTHR11472:SF4,hmmpanther:PTHR11472', 'AF': '', 'AFR_AF': '', 'AMR_AF': '', 'ASN_AF': '', 'EAS_AF': '', 'EUR_AF': '', 'SAS_AF': '', 'AA_AF': '', 'EA_AF': '', 'CLIN_SIG': '', 'SOMATIC': '', 'PUBMED': '', 'MOTIF_NAME': '', 'MOTIF_POS': '', 'HIGH_INF_POS': '', 'MOTIF_SCORE_CHANGE': '', 'IMPACT': 'LOW', 'PICK': '1', 'VARIANT_CLASS': 'SNV', 'TSL': '', 'HGVS_OFFSET': '', 'PHENO': '', 'MINIMISED': '', 'ExAC_AF': '', 'ExAC_AF_AFR': '', 'ExAC_AF_AMR': '', 'ExAC_AF_EAS': '', 'ExAC_AF_FIN': '', 'ExAC_AF_NFE': '', 'ExAC_AF_OTH': '', 'ExAC_AF_SAS': '', 'GENE_PHENO': '1', 'FILTER': '.', 'flanking_bps': 'AGG', 'vcf_id': '.', 'vcf_qual': '.', 'ExAC_AF_Adj': '', 'ExAC_AC_AN_Adj': '', 'ExAC_AC_AN': '', 'ExAC_AC_AN_AFR': '', 'ExAC_AC_AN_AMR': '', 'ExAC_AC_AN_EAS': '', 'ExAC_AC_AN_FIN': '', 'ExAC_AC_AN_NFE': '', 'ExAC_AC_AN_OTH': '', 'ExAC_AC_AN_SAS': '', 'ExAC_FILTER': '', 'gnomAD_AF': '', 'gnomAD_AFR_AF': '', 'gnomAD_AMR_AF': '', 'gnomAD_ASJ_AF': '', 'gnomAD_EAS_AF': '', 'gnomAD_FIN_AF': '', 'gnomAD_NFE_AF': '', 'gnomAD_OTH_AF': '', 'gnomAD_SAS_AF': '', 'vcf_pos': '62321135', 'AC': '1', 'AN': '2', 'SRC': 'Sample24,', 't_GT': './.', 'n_GT': '', 't_FL_AD': '0', 'n_FL_AD': '', 't_FL_ADN': '0', 'n_FL_ADN': '', 't_FL_ADP': '0', 'n_FL_ADP': '', 't_FL_DP': '184', 'n_FL_DP': '', 't_FL_DPN': '95', 'n_FL_DPN': '', 't_FL_DPP': '89', 'n_FL_DPP': '', 't_FL_RD': '184', 'n_FL_RD': '', 't_FL_RDN': '95', 'n_FL_RDN': '', 't_FL_RDP': '89', 'n_FL_RDP': '', 't_FL_VF': '0', 'n_FL_VF': '', 't_AD': '', 'n_AD': '', 't_DP': '.', 'n_DP': '', 't_depth_sample': '', 't_ref_count_sample': '', 't_alt_count_sample': '', 'is_fillout': 'True'} ] self.assertEqual(records, expected_records) From 11466570dbdd5409c5f39faa428c0b64260d8830 Mon Sep 17 00:00:00 2001 From: Stephen Kelly Date: Tue, 25 Jan 2022 18:40:27 -0500 Subject: [PATCH 2/4] update submodule --- pluto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pluto b/pluto index 7372d40..f2e6c56 160000 --- a/pluto +++ b/pluto @@ -1 +1 @@ -Subproject commit 7372d40fc2393dfad168faea149ef23326b3de1a +Subproject commit f2e6c5603db5ae4d7c1b13ea09ac7bd220f0c778 From 4b296760dcf26b1ce2e196d22329c5cdafd7f708 Mon Sep 17 00:00:00 2001 From: Stephen Kelly Date: Tue, 25 Jan 2022 20:19:29 -0500 Subject: [PATCH 3/4] update test cases for samples fillout workflow --- tests/test_samples_fillout_workflow_cwl.py | 80 ++++++++++++++-------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/tests/test_samples_fillout_workflow_cwl.py b/tests/test_samples_fillout_workflow_cwl.py index 8475f0a..3edbd1d 100644 --- a/tests/test_samples_fillout_workflow_cwl.py +++ b/tests/test_samples_fillout_workflow_cwl.py @@ -143,21 +143,21 @@ def test_run_fillout_workflow(self): } output_json, output_dir = self.run_cwl() + output_path = os.path.join(output_dir,'output.maf') expected_output = { 'output_file': { - 'location': 'file://' + os.path.join(output_dir,'output.maf'), + 'location': 'file://' + output_path, 'basename': 'output.maf', 'class': 'File', 'checksum': 'sha1$7932ae9938a5686f6328f143a6c82308877cb822', 'size': 8008, - 'path': os.path.join(output_dir,'output.maf') + 'path': output_path } } - self.assertEqual(output_json, expected_output) + self.assertCWLDictEqual(output_json, expected_output) - output_file = output_json['output_file']['path'] - reader = TableReader(output_file) + reader = TableReader(output_path) comments = reader.comment_lines fieldnames = reader.get_fieldnames() records = [ rec for rec in reader.read() ] @@ -188,40 +188,47 @@ def test_run_fillout_workflow2(self): This test uses full samples """ self.maxDiff = None - + maf1 = os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample1.Sample2.muts.maf") + maf24 = os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample24.Sample23.muts.maf") self.input = { + "samples": [ + { + "sample_id": "Sample1", + "normal_id": "Sample1-N", + "maf_file": { "class": "File", "path": maf1 } + }, + { + "sample_id": "Sample24", + "normal_id": "Sample24-N", + "maf_file": { "class": "File", "path": maf24 } + }, + ], "ref_fasta": {"class": "File", "path": self.DATA_SETS['Proj_1']['REF_FASTA']}, - "sample_ids": ["Sample1", "Sample24"], "bam_files": [ { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['BAM_DIR'], "Sample1.bam") }, { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['BAM_DIR'], "Sample24.bam") } - ], - "maf_files": [ - { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample1.Sample2.muts.maf") }, - { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample24.Sample23.muts.maf") } ] } output_json, output_dir = self.run_cwl() - + output_path = os.path.join(output_dir,'output.maf') expected_output = { 'output_file': { - 'location': 'file://' + os.path.join(output_dir,'output.maf'), + 'location': 'file://' + output_path, 'basename': 'output.maf', 'class': 'File', # 'checksum': 'sha1$be8534bcaf326de029790a832ab5b44a17a03d22', # 'size': 40194610, - 'path': os.path.join(output_dir,'output.maf') + 'path': output_path } } # NOTE: for some reason, this file keeps coming out with different annotations for 'splice_acceptor_variant' or `splice_donor_variant` # this keeps changing the byte size and checksum so need to remove those here for now output_json['output_file'].pop('checksum') output_json['output_file'].pop('size') - self.assertEqual(output_json, expected_output) + self.assertCWLDictEqual(output_json, expected_output) - output_file = output_json['output_file']['path'] - comments, mutations = self.load_mutations(output_file) + comments, mutations = self.load_mutations(output_path) self.assertEqual(len(mutations), 38920) @@ -232,7 +239,7 @@ def test_run_fillout_workflow2(self): mut.pop('Variant_Classification') hash = md5_obj(mutations) - expected_hash = '4b25d900ab90e0ed0b3702666ff01e94' + expected_hash = 'bc5f54f1057a7ba29f55d9d4aac92a01' self.assertEqual(hash, expected_hash) @@ -245,38 +252,51 @@ def test_run_fillout_workflow3(self): This test uses full samples """ self.maxDiff = None - + maf1 = os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample1.Sample2.muts.maf") + maf4 = os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample4.Sample3.muts.maf") self.input = { + "samples": [ + { + "sample_id": "Sample1", + "normal_id": "Sample1-N", + "maf_file": { "class": "File", "path": maf1 } + }, + { + "sample_id": "Sample4", + "normal_id": "Sample4-N", + "maf_file": { "class": "File", "path": maf4 } + }, + ], "ref_fasta": {"class": "File", "path": self.DATA_SETS['Proj_1']['REF_FASTA']}, - "sample_ids": ["Sample1", "Sample4"], + # "sample_ids": ["Sample1", "Sample4"], "bam_files": [ { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['BAM_DIR'], "Sample1.bam") }, { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['BAM_DIR'], "Sample4.bam") } - ], - "maf_files": [ - { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample1.Sample2.muts.maf") }, - { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample4.Sample3.muts.maf") } ] + # "maf_files": [ + # { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample1.Sample2.muts.maf") }, + # { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample4.Sample3.muts.maf") } + # ] } output_json, output_dir = self.run_cwl() + output_path = os.path.join(output_dir,'output.maf') expected_output = { 'output_file': { - 'location': 'file://' + os.path.join(output_dir,'output.maf'), + 'location': 'file://' + output_path, 'basename': 'output.maf', 'class': 'File', # 'checksum': 'sha1$2f60f58389ec65af87612c7532ad28b882fb84ba', # 'size': 26238820, - 'path': os.path.join(output_dir,'output.maf') + 'path': output_path } } output_json['output_file'].pop('checksum') output_json['output_file'].pop('size') - self.assertEqual(output_json, expected_output) + self.assertCWLDictEqual(output_json, expected_output) - output_file = output_json['output_file']['path'] - comments, mutations = self.load_mutations(output_file) + comments, mutations = self.load_mutations(output_path) hash = md5_obj(mutations) @@ -289,7 +309,7 @@ def test_run_fillout_workflow3(self): mut.pop('Variant_Classification') hash = md5_obj(mutations) - expected_hash = '77fb1f3aa26ddf06029232ee720a709c' + expected_hash = '4a03d128d76b72328b62a87814d89993' self.assertEqual(hash, expected_hash) From 342e6f1f4f7a3839e579fbe96ccc8d6f7a61ac77 Mon Sep 17 00:00:00 2001 From: Stephen Kelly Date: Wed, 26 Jan 2022 14:38:00 -0500 Subject: [PATCH 4/4] add samples input schema to samples_fillout_index_workflow --- cwl/samples_fillout_index_workflow.cwl | 114 +++++++++++++----- cwl/samples_fillout_workflow.cwl | 2 + ...test_samples_fillout_index_workflow_cwl.py | 45 ++++--- tests/test_samples_fillout_workflow_cwl.py | 5 - 4 files changed, 114 insertions(+), 52 deletions(-) diff --git a/cwl/samples_fillout_index_workflow.cwl b/cwl/samples_fillout_index_workflow.cwl index 6de0f4f..c6d6102 100644 --- a/cwl/samples_fillout_index_workflow.cwl +++ b/cwl/samples_fillout_index_workflow.cwl @@ -18,11 +18,15 @@ requirements: SubworkflowFeatureRequirement: {} inputs: - sample_ids: + samples: type: - type: array - items: string - + type: array + items: + type: record + fields: + maf_file: File + sample_id: string # must match sample ID used inside maf file + normal_id: string bam_files: type: type: array @@ -30,26 +34,20 @@ inputs: secondaryFiles: - ^.bai - maf_files: + unindexed_samples: type: type: array - items: File + items: + type: record + fields: + maf_file: File + sample_id: string # must match sample ID used inside maf file + normal_id: string unindexed_bam_files: type: type: array items: File - - unindexed_sample_ids: - type: - type: array - items: string - - unindexed_maf_files: - type: - type: array - items: File - ref_fasta: type: File secondaryFiles: @@ -60,8 +58,8 @@ inputs: - .sa - .fai - ^.dict - exac_filter: # need this to resolve error in subworkflow: Anonymous file object must have 'contents' and 'basename' fields. + # TODO: this needs the .tbi/.csi index file added!! type: File default: class: File @@ -85,41 +83,95 @@ steps: run: index_bam.cwl in: bam: unindexed_bam_files - scatter: [ bam ] - scatterMethod: dotproduct + scatter: bam out: [ bam_indexed ] # run filter script to apply cBioPortal filters on variants for fillout run_maf_filter: run: maf_filter.cwl in: - maf_file: maf_files + sample: samples + maf_file: + valueFrom: ${ return inputs.sample['maf_file']; } is_impact: is_impact argos_version_string: argos_version_string - scatter: [ maf_file ] - scatterMethod: dotproduct + scatter: sample out: [ cbio_mutation_data_file ] - # run the actual fillout workflow + run_maf_filter_unindexed: + run: maf_filter.cwl + in: + sample: unindexed_samples + maf_file: + valueFrom: ${ return inputs.sample['maf_file']; } + is_impact: is_impact + argos_version_string: argos_version_string + scatter: sample + out: [ cbio_mutation_data_file ] + + # update the samples to use the new filtered maf files and output a single list of samples + merge_samples_replace_mafs: + in: + samples: + source: [ samples, unindexed_samples ] + linkMerge: merge_flattened + maf_files: + source: [ run_maf_filter/cbio_mutation_data_file, run_maf_filter_unindexed/cbio_mutation_data_file ] + linkMerge: merge_flattened + out: [ samples ] + run: + class: ExpressionTool + inputs: + samples: + type: + type: array + items: + type: record + fields: + maf_file: File + sample_id: string + normal_id: string + maf_files: File[] + outputs: + samples: + type: + type: array + items: + type: record + fields: + maf_file: File + sample_id: string + normal_id: string + # NOTE: in the line below `var i in inputs.samples`, `i` is an int representing the index position in the array `inputs.samples` + # in Python it would look like ` x = ['a', 'b']; for i in range(len(x)): print(i, x[i]) ` + expression: "${ + var new_samples = []; + + for ( var i in inputs.samples ){ + new_samples.push({ + 'sample_id': inputs.samples[i]['sample_id'], + 'normal_id': inputs.samples[i]['normal_id'], + 'maf_file': inputs.maf_files[i] + }); + }; + + return {'samples': new_samples}; + }" + + # run the fillout workflow run_samples_fillout: run: samples_fillout_workflow.cwl in: output_fname: fillout_output_fname exac_filter: exac_filter - sample_ids: - source: [ sample_ids, unindexed_sample_ids ] - linkMerge: merge_flattened + samples: merge_samples_replace_mafs/samples bam_files: source: [ bam_files, run_indexer/bam_indexed ] linkMerge: merge_flattened - maf_files: - source: [ run_maf_filter/cbio_mutation_data_file, unindexed_maf_files ] - linkMerge: merge_flattened ref_fasta: ref_fasta out: [ output_file ] outputs: - output_file: type: File outputSource: run_samples_fillout/output_file diff --git a/cwl/samples_fillout_workflow.cwl b/cwl/samples_fillout_workflow.cwl index 7793d8b..69c6520 100644 --- a/cwl/samples_fillout_workflow.cwl +++ b/cwl/samples_fillout_workflow.cwl @@ -71,6 +71,8 @@ steps: sample_id: string outputs: sample_ids: string[] + # NOTE: in the line below `var i in inputs.samples`, `i` is an int representing the index position in the array `inputs.samples` + # in Python it would look like ` x = ['a', 'b']; for i in range(len(x)): print(i, x[i]) ` expression: "${ var sample_ids = []; for ( var i in inputs.samples ){ diff --git a/tests/test_samples_fillout_index_workflow_cwl.py b/tests/test_samples_fillout_index_workflow_cwl.py index 26761f9..437e00c 100644 --- a/tests/test_samples_fillout_index_workflow_cwl.py +++ b/tests/test_samples_fillout_index_workflow_cwl.py @@ -32,40 +32,54 @@ def test_run_fillout_workflow(self): """ Test case for running the fillout workflow on a number of samples, each with a bam and maf """ + # self.preserve = True self.maxDiff = None self.runner_args['use_cache'] = False # do not use cache for samples fillout workflow it breaks on split_vcf_to_mafs + # self.runner_args['debug'] = True + # self.runner_args['js_console'] = True self.input = { - "ref_fasta": {"class": "File", "path": self.DATA_SETS['Proj_08390_G']['REF_FASTA']}, - "sample_ids": ["Sample1"], + "samples": [ + { + "sample_id": "Sample1", + "normal_id": "Sample1-N", + "maf_file": { "class": "File", "path": self.sample1_maf } + } + ], "bam_files": [ { "class": "File", "path": self.sample1_bam } ], - "maf_files": [ - { "class": "File", "path": self.sample1_maf } + "unindexed_samples": [ + { + "sample_id": "Sample4", + "normal_id": "DMP-MATCHED-NORMAL", + "maf_file": { "class": "File", "path": self.sample4_maf } + }, + { + "sample_id": "Sample24", + "normal_id": "DMP-UNMATCHED-NORMAL", + "maf_file": { "class": "File", "path": self.sample24_maf } + }, ], - "unindexed_sample_ids": ["Sample4", "Sample24"], "unindexed_bam_files": [ { "class": "File", "path": self.sample4_bam }, { "class": "File", "path": self.sample24_bam } ], - "unindexed_maf_files": [ - { "class": "File", "path": self.sample4_maf }, - { "class": "File", "path": self.sample24_maf } - ], - "fillout_output_fname": 'output.maf' + "fillout_output_fname": 'output.maf', + "ref_fasta": {"class": "File", "path": self.DATA_SETS['Proj_08390_G']['REF_FASTA']}, } output_json, output_dir = self.run_cwl() + output_file = os.path.join(output_dir,'output.maf') expected_output = { 'output_file': { - 'location': 'file://' + os.path.join(output_dir,'output.maf'), + 'location': 'file://' + output_file, 'basename': 'output.maf', 'class': 'File', # 'checksum': 'sha1$d8d63a0aca2da20d2d15e26fcf64fd6295eda05e', # 'size': 25838928, - 'path': os.path.join(output_dir,'output.maf') + 'path': output_file } } # NOTE: for some reason, this file keeps coming out with different annotations for 'splice_acceptor_variant' or `splice_donor_variant` @@ -74,10 +88,9 @@ def test_run_fillout_workflow(self): output_json['output_file'].pop('size') self.assertCWLDictEqual(output_json, expected_output) - output_file = output_json['output_file']['path'] + # instead of checksum and size, count the number of mutations and take a checksum on the mutation contents comments, mutations = self.load_mutations(output_file) - - self.assertEqual(len(mutations), 23742) + self.assertEqual(len(mutations), 117) # Need to remove these fields because they are inconsistent on the output maf file; for mut in mutations: @@ -86,7 +99,7 @@ def test_run_fillout_workflow(self): mut.pop('Variant_Classification') hash = md5_obj(mutations) - expected_hash = 'f153c68bc79a6f28e261ec04f51b2111' + expected_hash = '01af6b281f70e6821addce80a2ec5cf8' self.assertEqual(hash, expected_hash) diff --git a/tests/test_samples_fillout_workflow_cwl.py b/tests/test_samples_fillout_workflow_cwl.py index 3edbd1d..5a56c61 100644 --- a/tests/test_samples_fillout_workflow_cwl.py +++ b/tests/test_samples_fillout_workflow_cwl.py @@ -268,15 +268,10 @@ def test_run_fillout_workflow3(self): }, ], "ref_fasta": {"class": "File", "path": self.DATA_SETS['Proj_1']['REF_FASTA']}, - # "sample_ids": ["Sample1", "Sample4"], "bam_files": [ { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['BAM_DIR'], "Sample1.bam") }, { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['BAM_DIR'], "Sample4.bam") } ] - # "maf_files": [ - # { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample1.Sample2.muts.maf") }, - # { "class": "File", "path": os.path.join(self.DATA_SETS['Proj_1']['MAF_DIR'], "Sample4.Sample3.muts.maf") } - # ] } output_json, output_dir = self.run_cwl()