Merge pull request #242 from susannasiebert/fusions

Integrate fusion processing into the pipeline
griffithlab · Jan 6, 2017 · cbf0a94 · cbf0a94
2 parents 1b4961a + 49ea946
commit cbf0a94
Show file tree

Hide file tree

Showing 14 changed files with 677 additions and 17 deletions.
diff --git a/pvacseq/lib/main.py b/pvacseq/lib/main.py
@@ -27,7 +27,9 @@ def define_parser():
 
     parser.add_argument(
         "input_file",
-        help="A VEP-annotated single-sample VCF containing transcript, Wildtype protein sequence, and Downstream protein sequence information"
+        help="The variant input file to process. This can either be a VEP-annotated single-sample VCF "
+             + "containing transcript, Wildtype protein sequence, and Downstream protein sequence information, "
+             + "or a INTEGRATE-Neo bedpe file with fusions."
     )
     parser.add_argument(
         "sample_name",
@@ -183,6 +185,13 @@ def main(args_input = sys.argv[1:]):
     parser = define_parser()
     args = parser.parse_args(args_input)
 
+    # Infer the input type from the file extension. The result is handed to
+    # the pipeline through the 'input_file_type' entry of shared_arguments.
+    if args.input_file.endswith('.vcf'):
+        input_file_type = 'vcf'
+    elif args.input_file.endswith('.bedpe'):
+        input_file_type = 'bedpe'
+    else:
+        # Report the offending path from the parsed arguments, the same
+        # attribute the extension checks above read.
+        sys.exit("Unknown input file type for file (%s). Input file must be either a VCF (.vcf) or a bedpe (.bedpe) file." % args.input_file)
+
     PredictionClass.check_alleles_valid(args.allele)
 
     if "." in args.sample_name:
@@ -223,6 +232,7 @@ def main(args_input = sys.argv[1:]):
 
     shared_arguments = {
         'input_file'                : args.input_file,
+        'input_file_type'           : input_file_type,
         'sample_name'               : args.sample_name,
         'top_result_per_mutation'   : args.top_result_per_mutation,
         'top_score_metric'          : args.top_score_metric,

diff --git a/pvacseq/lib/pipeline.py b/pvacseq/lib/pipeline.py
@@ -20,6 +20,7 @@ def status_message(msg):
 class Pipeline(metaclass=ABCMeta):
     def __init__(self, **kwargs):
         self.input_file                  = kwargs['input_file']
+        self.input_file_type             = kwargs['input_file_type']
         self.sample_name                 = kwargs['sample_name']
         self.alleles                     = kwargs['alleles']
         self.prediction_algorithms       = kwargs['prediction_algorithms']
@@ -59,8 +60,39 @@ def tsv_file_path(self):
         tsv_file = self.sample_name + '.tsv'
         return os.path.join(self.output_dir, tsv_file)
 
+    def converter(self, params):
+        """Instantiate the input converter for ``self.input_file_type``.
+
+        ``params`` is a dict forwarded as keyword arguments to the selected
+        class. A type missing from the table raises ``KeyError``.
+        """
+        class_name = {
+            'vcf'  : 'VcfConverter',
+            'bedpe': 'IntegrateConverter',
+        }[self.input_file_type]
+        # The class is resolved by name on the module this method lives in.
+        this_module = sys.modules[__name__]
+        return getattr(this_module, class_name)(**params)
+
+    def fasta_generator(self, params):
+        """Instantiate the FASTA generator for ``self.input_file_type``.
+
+        ``params`` is a dict forwarded as keyword arguments to the selected
+        class. A type missing from the table raises ``KeyError``.
+        """
+        class_name = {
+            'vcf'  : 'FastaGenerator',
+            'bedpe': 'FusionFastaGenerator',
+        }[self.input_file_type]
+        # The class is resolved by name on the module this method lives in.
+        this_module = sys.modules[__name__]
+        return getattr(this_module, class_name)(**params)
+
+    def output_parser(self, params):
+        """Instantiate the output parser for ``self.input_file_type``.
+
+        ``params`` is a dict forwarded as keyword arguments to the selected
+        class. A type missing from the table raises ``KeyError``.
+        """
+        class_name = {
+            'vcf'  : 'DefaultOutputParser',
+            'bedpe': 'FusionOutputParser',
+        }[self.input_file_type]
+        # The class is resolved by name on the module this method lives in.
+        this_module = sys.modules[__name__]
+        return getattr(this_module, class_name)(**params)
+
+    def tsv_file_path(self):
+        """Return the path of ``<sample_name>.tsv`` inside ``self.output_dir``."""
+        # NOTE(review): an identical tsv_file_path appears earlier in this
+        # class; this later definition replaces it - confirm the duplicate
+        # is intended.
+        return os.path.join(self.output_dir, self.sample_name + '.tsv')
+
     def convert_vcf(self):
-        status_message("Converting VCF to TSV")
+        status_message("Converting .%s to TSV" % self.input_file_type)
         if os.path.exists(self.tsv_file_path()):
             status_message("TSV file already exists. Skipping.")
             return
@@ -84,7 +116,7 @@ def convert_vcf(self):
             else:
                 convert_params[attribute] = None
 
-        converter = VcfConverter(**convert_params)
+        converter = self.converter(convert_params)
         converter.execute()
         print("Completed")
 
@@ -246,7 +278,10 @@ def execute(self):
 
         total_row_count = self.tsv_entry_count()
         if total_row_count == 0:
-            sys.exit("The TSV file is empty. Please check that the input VCF contains missense, inframe indel, or frameshift mutations.")
+            if self.input_file_type == 'vcf':
+                sys.exit("The TSV file is empty. Please check that the input VCF contains missense, inframe indel, or frameshift mutations.")
+            elif self.input_file_type == 'bedpe':
+                sys.exit("The TSV file is empty. Please check that the input bedpe file contains fusion entries.")
         chunks = self.split_tsv_file(total_row_count)
 
         self.generate_fasta(chunks)
@@ -324,7 +359,7 @@ def generate_fasta(self, chunks):
                 'output_key_file'           : split_fasta_key_file_path,
                 'downstream_sequence_length': self.downstream_sequence_length,
             }
-            fasta_generator = FastaGenerator(**generate_fasta_params)
+            fasta_generator = self.fasta_generator(generate_fasta_params)
             fasta_generator.execute()
         status_message("Completed")
 
@@ -387,7 +422,7 @@ def call_iedb_and_parse_outputs(self, chunks):
                             'top_score_metric'       : self.top_score_metric,
                             'top_result_per_mutation': self.top_result_per_mutation
                         }
-                        parser = DefaultOutputParser(**params)
+                        parser = self.output_parser(params)
                         parser.execute()
                         status_message("Completed")
                         split_parsed_output_files.append(split_parsed_file_path)
@@ -418,7 +453,7 @@ def generate_fasta(self, chunks):
                 'output_key_file'           : split_fasta_key_file_path,
                 'downstream_sequence_length': self.downstream_sequence_length,
             }
-            fasta_generator = FastaGenerator(**generate_fasta_params)
+            fasta_generator = self.fasta_generator(generate_fasta_params)
             fasta_generator.execute()
         status_message("Completed")
 
@@ -475,7 +510,7 @@ def call_iedb_and_parse_outputs(self, chunks):
                         'top_score_metric'       : self.top_score_metric,
                         'top_result_per_mutation': self.top_result_per_mutation
                     }
-                    parser = DefaultOutputParser(**params)
+                    parser = self.output_parser(params)
                     parser.execute()
                     status_message("Completed")
                     split_parsed_output_files.append(split_parsed_file_path)