Remove input MAF filtering on unindexed samples in the fillout workflow

mskcc · Feb 7, 2022 · d8a8af9 · d8a8af9
1 parent 852d935
commit d8a8af9
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 21 deletions.
diff --git a/cwl/samples_fillout_index_workflow.cwl b/cwl/samples_fillout_index_workflow.cwl
@@ -18,7 +18,7 @@ requirements:
   SubworkflowFeatureRequirement: {}
 
 inputs:
-  samples:
+  samples: # NOTE: in prod, these end up being the research samples
     type:
       type: array
       items:
@@ -34,7 +34,7 @@ inputs:
     secondaryFiles:
         - ^.bai
 
-  unindexed_samples:
+  unindexed_samples: # NOTE: in prod, these end up being the clinical samples
     type:
       type: array
       items:
@@ -65,6 +65,7 @@ inputs:
       class: File
       path: /juno/work/ci/resources/vep/cache/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz
 
+  # these are needed for the filter script
   is_impact:
     type: boolean
     default: True
@@ -98,26 +99,27 @@ steps:
     scatter: sample
     out: [ cbio_mutation_data_file ]
 
-  run_maf_filter_unindexed:
-    run: maf_filter.cwl
-    in:
-      sample: unindexed_samples
-      maf_file:
-        valueFrom: ${ return inputs.sample['maf_file']; }
-      is_impact: is_impact
-      argos_version_string: argos_version_string
-    scatter: sample
-    out: [ cbio_mutation_data_file ]
+  # NOTE: In prod, the unindexed_samples end up being the clinical samples; we do not want to apply the MAF filter to the clinical mutation input files
+  # run_maf_filter_unindexed:
+  #   run: maf_filter.cwl
+  #   in:
+  #     sample: unindexed_samples
+  #     maf_file:
+  #       valueFrom: ${ return inputs.sample['maf_file']; }
+  #     is_impact: is_impact
+  #     argos_version_string: argos_version_string
+  #   scatter: sample
+  #   out: [ cbio_mutation_data_file ]
 
   # update the samples to use the new filtered maf files and output a single list of samples
   merge_samples_replace_mafs:
     in:
-      samples:
-        source: [ samples, unindexed_samples ]
-        linkMerge: merge_flattened
-      maf_files:
-        source: [ run_maf_filter/cbio_mutation_data_file, run_maf_filter_unindexed/cbio_mutation_data_file ]
-        linkMerge: merge_flattened
+      samples: samples
+        # source: [ samples, unindexed_samples ]
+        # linkMerge: merge_flattened
+      maf_files: run_maf_filter/cbio_mutation_data_file
+        # source: [ run_maf_filter/cbio_mutation_data_file, run_maf_filter_unindexed/cbio_mutation_data_file ]
+        # linkMerge: merge_flattened
     out: [ samples ]
     run:
       class: ExpressionTool
@@ -164,7 +166,10 @@ steps:
     in:
       output_fname: fillout_output_fname
       exac_filter: exac_filter
-      samples: merge_samples_replace_mafs/samples
+      # samples: merge_samples_replace_mafs/samples
+      samples:
+        source: [ merge_samples_replace_mafs/samples, unindexed_samples ]
+        linkMerge: merge_flattened
       bam_files:
         source: [ bam_files, run_indexer/bam_indexed ]
         linkMerge: merge_flattened

diff --git a/tests/test_samples_fillout_index_workflow_cwl.py b/tests/test_samples_fillout_index_workflow_cwl.py
@@ -90,7 +90,7 @@ def test_run_fillout_workflow(self):
 
         # instead of checksum and size, count the number of mutations and take a checksum on the mutation contents
         comments, mutations = self.load_mutations(output_file)
-        self.assertEqual(len(mutations), 117)
+        self.assertEqual(len(mutations), 23742)
 
         # Need to remove these fields because they are inconsistent in the output maf file
         for mut in mutations:
@@ -99,7 +99,7 @@ def test_run_fillout_workflow(self):
             mut.pop('Variant_Classification')
 
         hash = md5_obj(mutations)
-        expected_hash = '01af6b281f70e6821addce80a2ec5cf8'
+        expected_hash = 'c96f641cb134ed99c49aed7d42a0f5af'
         self.assertEqual(hash, expected_hash)