From c54aacf3b0dafd7c009ec32fc255238afe4188c1 Mon Sep 17 00:00:00 2001
From: Susanna Kiwala
Date: Wed, 7 Dec 2016 08:52:47 -0600
Subject: [PATCH] Adapt main pipeline to accept INTEGRATE-Neo fusion files

---
Review notes (text between the "---" line and the diff is not part of the
commit message):
 * main(): the unknown-file-type message now interpolates args.input_file.
   The bare name `input_file` is not defined by any line of main() visible
   in this patch, so the original branch would presumably raise NameError
   instead of printing the message.
 * Pipeline: dropped the re-added tsv_file_path(); the hunk's own context
   already shows an identical definition directly above it. The hunk
   header, the later hunks' new-file offsets and the diffstat were
   recounted to match.
 * Fixed typos in two user-facing strings ("a INTEGRATE-Neo", "tha the").
 * Line breaks and indentation were reconstructed from a
   whitespace-collapsed copy using the hunk line counts. Blank context
   lines, column alignment inside dict literals, and the nesting depth of
   the mid-function hunks are best guesses: verify against the target
   files (or apply with --ignore-whitespace and re-check the indentation
   of the added lines) before use.
 * The "index" blob ids are unchanged from the original and no longer
   describe the post-image of this edited patch.

 pvacseq/lib/main.py     | 12 +++++++++-
 pvacseq/lib/pipeline.py | 45 ++++++++++++++++++++++++++++++++------
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/pvacseq/lib/main.py b/pvacseq/lib/main.py
index cdf6796..5d47c5e 100644
--- a/pvacseq/lib/main.py
+++ b/pvacseq/lib/main.py
@@ -27,7 +27,9 @@ def define_parser():
 
     parser.add_argument(
         "input_file",
-        help="A VEP-annotated single-sample VCF containing transcript, Wildtype protein sequence, and Downstream protein sequence information"
+        help="The variant input file to process. This can either be a VEP-annotated single-sample VCF "
+             + "containing transcript, Wildtype protein sequence, and Downstream protein sequence information, "
+             + "or an INTEGRATE-Neo bedpe file with fusions."
     )
     parser.add_argument(
         "sample_name",
@@ -183,6 +185,13 @@ def main(args_input = sys.argv[1:]):
     parser = define_parser()
     args = parser.parse_args(args_input)
 
+    if args.input_file.endswith('.vcf'):
+        input_file_type = 'vcf'
+    elif args.input_file.endswith('.bedpe'):
+        input_file_type = 'bedpe'
+    else:
+        sys.exit("Unknown input file type for file (%s). Input file must be either a VCF (.vcf) or a bedpe (.bedpe) file." % args.input_file)
+
     if "." in args.sample_name:
         sys.exit("Sample name cannot contain '.'")
 
@@ -226,6 +235,7 @@ def main(args_input = sys.argv[1:]):
 
     shared_arguments = {
         'input_file' : args.input_file,
+        'input_file_type' : input_file_type,
         'sample_name' : args.sample_name,
         'top_result_per_mutation' : args.top_result_per_mutation,
         'top_score_metric' : args.top_score_metric,
diff --git a/pvacseq/lib/pipeline.py b/pvacseq/lib/pipeline.py
index fc7e1b8..87fe5df 100644
--- a/pvacseq/lib/pipeline.py
+++ b/pvacseq/lib/pipeline.py
@@ -20,6 +20,7 @@ def status_message(msg):
 class Pipeline(metaclass=ABCMeta):
     def __init__(self, **kwargs):
         self.input_file = kwargs['input_file']
+        self.input_file_type = kwargs['input_file_type']
         self.sample_name = kwargs['sample_name']
         self.alleles = kwargs['alleles']
         self.prediction_algorithms = kwargs['prediction_algorithms']
@@ -59,8 +60,35 @@ def tsv_file_path(self):
         tsv_file = self.sample_name + '.tsv'
         return os.path.join(self.output_dir, tsv_file)
 
+    def converter(self, params):
+        converter_types = {
+            'vcf'  : 'VcfConverter',
+            'bedpe': 'IntegrateConverter',
+        }
+        converter_type = converter_types[self.input_file_type]
+        converter = getattr(sys.modules[__name__], converter_type)
+        return converter(**params)
+
+    def fasta_generator(self, params):
+        generator_types = {
+            'vcf'  : 'FastaGenerator',
+            'bedpe': 'FusionFastaGenerator',
+        }
+        generator_type = generator_types[self.input_file_type]
+        generator = getattr(sys.modules[__name__], generator_type)
+        return generator(**params)
+
+    def output_parser(self, params):
+        parser_types = {
+            'vcf'  : 'DefaultOutputParser',
+            'bedpe': 'FusionOutputParser',
+        }
+        parser_type = parser_types[self.input_file_type]
+        parser = getattr(sys.modules[__name__], parser_type)
+        return parser(**params)
+
     def convert_vcf(self):
-        status_message("Converting VCF to TSV")
+        status_message("Converting .%s to TSV" % self.input_file_type)
         if os.path.exists(self.tsv_file_path()):
             status_message("TSV file already exists. Skipping.")
             return
@@ -84,7 +112,7 @@ def convert_vcf(self):
             else:
                 convert_params[attribute] = None
 
-        converter = VcfConverter(**convert_params)
+        converter = self.converter(convert_params)
         converter.execute()
         print("Completed")
 
@@ -246,7 +274,10 @@ def execute(self):
 
         total_row_count = self.tsv_entry_count()
         if total_row_count == 0:
-            sys.exit("The TSV file is empty. Please check that the input VCF contains missense, inframe indel, or frameshift mutations.")
+            if self.input_file_type == 'vcf':
+                sys.exit("The TSV file is empty. Please check that the input VCF contains missense, inframe indel, or frameshift mutations.")
+            elif self.input_file_type == 'bedpe':
+                sys.exit("The TSV file is empty. Please check that the input bedpe file contains fusion entries.")
         chunks = self.split_tsv_file(total_row_count)
 
         self.generate_fasta(chunks)
@@ -324,7 +355,7 @@ def generate_fasta(self, chunks):
                     'output_key_file' : split_fasta_key_file_path,
                     'downstream_sequence_length': self.downstream_sequence_length,
                 }
-                fasta_generator = FastaGenerator(**generate_fasta_params)
+                fasta_generator = self.fasta_generator(generate_fasta_params)
                 fasta_generator.execute()
         status_message("Completed")
 
@@ -387,7 +418,7 @@ def call_iedb_and_parse_outputs(self, chunks):
                             'top_score_metric' : self.top_score_metric,
                             'top_result_per_mutation': self.top_result_per_mutation
                         }
-                        parser = DefaultOutputParser(**params)
+                        parser = self.output_parser(params)
                         parser.execute()
                         status_message("Completed")
                         split_parsed_output_files.append(split_parsed_file_path)
@@ -418,7 +449,7 @@ def generate_fasta(self, chunks):
                 'output_key_file' : split_fasta_key_file_path,
                 'downstream_sequence_length': self.downstream_sequence_length,
             }
-            fasta_generator = FastaGenerator(**generate_fasta_params)
+            fasta_generator = self.fasta_generator(generate_fasta_params)
             fasta_generator.execute()
         status_message("Completed")
 
@@ -475,7 +506,7 @@ def call_iedb_and_parse_outputs(self, chunks):
                         'top_score_metric' : self.top_score_metric,
                         'top_result_per_mutation': self.top_result_per_mutation
                     }
-                    parser = DefaultOutputParser(**params)
+                    parser = self.output_parser(params)
                     parser.execute()
                     status_message("Completed")
                     split_parsed_output_files.append(split_parsed_file_path)