From b4b8b9385dc1383037ab79ae64c0fcabc52b355d Mon Sep 17 00:00:00 2001 From: evanbiederstedt Date: Mon, 29 Jul 2019 15:57:53 -0400 Subject: [PATCH 1/4] removes use of pd.read_table() --- neoantigen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index c1d830a..f751750 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -336,7 +336,7 @@ def main(): logger.info('Starting NetMHC 4.0...') ##### # For netMHC-4 prediction, only predict on alleles for which data exists - netmhc_alleles = list(pd.read_table(netmhc4_alleleslist, header=None, usecols=[0])[0]) + netmhc_alleles = list(pd.read_csv(netmhc4_alleleslist, header=None, usecols=[0])[0], sep='\t') alleles_for_prediction = list(set(netmhc_alleles) & set([x.replace(':', '') for x in hla_alleles])) logger.info('Only predicting on the following HLA-alleles: ' + ','.join(sorted(set(alleles_for_prediction)))) @@ -388,7 +388,7 @@ def main(): # read combined_output file containing all neopeptides that have been evaluated by both prediction algorithms logger.info('Reading predictions from the two algorithms and evaluating binders') - np_df = pd.read_table(combined_output).drop_duplicates() + np_df = pd.read_csv(combined_output, sep='\t').drop_duplicates() ## netMHC-4.0 requires and outputs alleles in a different format; just correct the name np_df['hla_allele'] = np_df['hla_allele'].map(lambda a: reformat_hla_allele(a)) From 861dc334cc004bdb47d1a689ea6c958a5b9f3a84 Mon Sep 17 00:00:00 2001 From: evanbiederstedt Date: Fri, 2 Aug 2019 01:53:10 -0400 Subject: [PATCH 2/4] use pandas from_dict() to avoid warnings --- neoantigen.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index f751750..59d5fd4 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -10,6 +10,7 @@ import gzip import copy from joblib import Parallel, delayed +from collections import OrderedDict ##### # Neoantigen prediction pipeline. Four main steps: @@ -456,10 +457,11 @@ def main(): maf_output.append(mut.get_maf_row_to_print()) predictions_output.extend(mut.get_predictions_rows_to_print()) - maf_output_df = pd.DataFrame.from_items([(s.name, s) for s in maf_output]).T + + maf_output_df = pd.DataFrame.from_dict(OrderedDict([s.name, s] for s in maf_output).T maf_output_df.to_csv(sample_path_pfx + '.neoantigens.maf' , sep='\t', index=False) - predictions_output_df = pd.DataFrame.from_items([(s.name, s) for s in predictions_output]).T + predictions_output_df = pd.DataFrame.from_dict(OrderedDict([s.name, s] for s in predictions_output)).T predictions_output_df.to_csv(sample_path_pfx + '.all_neoantigen_predictions.txt', sep='\t', index=False) except Exception: From e05aef4c318b8ea10a3a92fd1d7e209745408635 Mon Sep 17 00:00:00 2001 From: evanbiederstedt Date: Fri, 2 Aug 2019 02:06:39 -0400 Subject: [PATCH 3/4] missing ) --- neoantigen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neoantigen.py b/neoantigen.py index 59d5fd4..d9c692e 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -458,7 +458,7 @@ def main(): predictions_output.extend(mut.get_predictions_rows_to_print()) - maf_output_df = pd.DataFrame.from_dict(OrderedDict([s.name, s] for s in maf_output).T + maf_output_df = pd.DataFrame.from_dict(OrderedDict([s.name, s] for s in maf_output)).T maf_output_df.to_csv(sample_path_pfx + '.neoantigens.maf' , sep='\t', index=False) predictions_output_df = pd.DataFrame.from_dict(OrderedDict([s.name, s] for s in predictions_output)).T From ca65bb13e467481f38ab7e11362c97d2b9220011 Mon Sep 17 00:00:00 2001 From: evanbiederstedt Date: Fri, 2 Aug 2019 19:12:46 -0400 Subject: [PATCH 4/4] added python shebang, and correct typo with pd.read_csv --- neoantigen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/neoantigen.py b/neoantigen.py index d9c692e..de34222 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + from __future__ import print_function from six.moves.configparser import ConfigParser @@ -337,7 +339,7 @@ def main(): logger.info('Starting NetMHC 4.0...') ##### # For netMHC-4 prediction, only predict on alleles for which data exists - netmhc_alleles = list(pd.read_csv(netmhc4_alleleslist, header=None, usecols=[0])[0], sep='\t') + netmhc_alleles = list(pd.read_csv(netmhc4_alleleslist, header=None, usecols=[0], sep='\t')[0]) alleles_for_prediction = list(set(netmhc_alleles) & set([x.replace(':', '') for x in hla_alleles])) logger.info('Only predicting on the following HLA-alleles: ' + ','.join(sorted(set(alleles_for_prediction))))