From 9080339689da4df9af4dda09cbccdbe0dc9cc8cb Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Fri, 14 Jun 2019 16:02:28 -0400 Subject: [PATCH 1/9] Run 2to3 and update README --- README.md | 2 +- neoantigen.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 657a34a..9f3f556 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The pipeline has four main steps: ## Install -Clone the repo and install any necessary python2 libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 2.7, not Python 3.x : +Clone the repo and install any necessary Python libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 3, not Python 2.x: ```bash git clone https://github.com/taylor-lab/neoantigen-dev.git diff --git a/neoantigen.py b/neoantigen.py index f179825..7ba54ab 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -8,7 +8,7 @@ import logging, logging.handlers import gzip import copy -from ConfigParser import ConfigParser +from configparser import ConfigParser from joblib import Parallel, delayed ##### @@ -90,7 +90,7 @@ def main(): peptide_lengths = [9, 10] if args.peptide_lengths is not None: - peptide_lengths = map(int, str(args.peptide_lengths).split(',')) + peptide_lengths = list(map(int, str(args.peptide_lengths).split(','))) if args.config_file is None: config_file_path = os.path.dirname(os.path.realpath(__file__)) + '/neoantigen-luna.config' @@ -98,7 +98,7 @@ def main(): config_file_path = str(args.config_file) if not os.path.exists(config_file_path): - print 'Error: could not open config file: ' + config_file_path + '. Exiting.' + print('Error: could not open config file: ' + config_file_path + '. Exiting.') exit(1) keep_tmp_files = False @@ -115,19 +115,19 @@ def main(): force_netmhc = True if args.normal_bam is None and args.hla_file is None: - print >> sys.stderr, 'Error: --normal_bam or --hla_file is required. Exiting.' + print('Error: --normal_bam or --hla_file is required. Exiting.', file=sys.stderr) exit(1) if args.normal_bam is not None and not os.path.exists(normal_bamfile): - print >> sys.stderr, 'Error: --normal_bam '' + normal_bamfile + '' does not exist. Exiting.' + print('Error: --normal_bam '' + normal_bamfile + '' does not exist. Exiting.', file=sys.stderr) exit(1) if args.hla_file is not None and not os.path.exists(hla_file): - print >> sys.stderr, 'Error: --hla_file '' + hla_file + '' does not exist. Exiting.' + print('Error: --hla_file '' + hla_file + '' does not exist. Exiting.', file=sys.stderr) exit(1) if not os.path.exists(maf_file): - print >> sys.stderr, 'Error: --maf_file '' + maf_file + '' does not exist. Exiting.' + print('Error: --maf_file '' + maf_file + '' does not exist. Exiting.', file=sys.stderr) exit(1) os.system('mkdir -p ' + output_dir) @@ -424,18 +424,18 @@ def main(): logger.info('Checking if the icore-peptide can be generated from WT sequence from the entire peptidome...') ref_aa_str = '' - for cds in cds_seqs.values(): + for cds in list(cds_seqs.values()): if cds[0:3] == 'ATG': ref_aa_str += mutation.cds_to_aa(cds) + '|' # make a list of all unique peptides - all_peptides = ({row['icore']:1 for index, row in np_df.iterrows()}).keys() + all_peptides = list(({row['icore']:1 for index, row in np_df.iterrows()}).keys()) # parallelize and search each icore peptide against the reference peptidome. Note: deliberately hard-coded 4 cores for now. 
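        # (a hedged reading of the fan-out below: each worker gets the full peptide
        # list plus its own 1-based index and the worker count, so find_in_reference_peptides
        # presumably scans its own quarter of the peptides against ref_aa_str and
        # returns the matching icores; Parallel collects one hit-list per worker)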
        results = Parallel(n_jobs=4)(delayed(find_in_reference_peptides)(all_peptides, i, 4, ref_aa_str) for i in range(1, 5))

         # construct a dataframe of the peptides that are found in other protein coding genes
-        icore_in_reference = pd.DataFrame(data={item:1 for sublist in results for item in sublist}.keys(), columns=['icore'])
+        icore_in_reference = pd.DataFrame(data=list({item:1 for sublist in results for item in sublist}.keys()), columns=['icore'])
         icore_in_reference['is_in_wt_peptidome'] = True
 
         logger.info('...completed!')
@@ -671,7 +671,7 @@ def generate_translated_sequences(self, pad_len=10):
 
         ## append the 3'UTR to the CDS -- to account for non stop mutations and indels that shift the canonical stop
         if not self.cds_seq in self.cdna_seq:
-            print 'Skipping because the CDS is not contained within cDNA. Note: only 2 transcripts/peptides are like this'
+            print('Skipping because the CDS is not contained within cDNA. Note: only 2 transcripts/peptides are like this')
             return None
 
         hgvsc = self.maf_row['HGVSc']

From cee30e47721d1e2e407c6a38c13703b70bad4dd3 Mon Sep 17 00:00:00 2001
From: Benjamin Lee
Date: Fri, 14 Jun 2019 16:03:17 -0400
Subject: [PATCH 2/9] Prettify the README

---
 README.md | 60 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 9f3f556..3b2121d 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,14 @@
-
-
 # neoantigen-dev
+
 MHC Class I neoantigen prediction pipeline from IM5/IM6/WES/WGS. Takes Normal `.bam` and somatic `.maf` and generates neoantigen predictions for HLA-A/B/C.
 
 The pipeline has four main steps:
 
-1. **Genotype HLA**. genotyping performed using POLYSOLVER.
-2. **Construct mutated peptides**. For non-synonymous mutations, generates mutated peptide sequences based on `HGVSc`. _NOTE_: `.maf` file should be VEP annotated using `cmo_maf2maf --version 1.6.14 --vep-release 88` **using this EXACT VERSION**. TODO: Generate mutated sequences for fusions.
-3. **Run NetMHCpan-4.0 and NetMHC-4.0**. using default parameters for each algorithm.
+1. **Genotype HLA**. genotyping performed using POLYSOLVER.
+2. **Construct mutated peptides**. For non-synonymous mutations, generates mutated peptide sequences based on `HGVSc`. _NOTE_: `.maf` file should be VEP annotated using `cmo_maf2maf --version 1.6.14 --vep-release 88` **using this EXACT VERSION**. TODO: Generate mutated sequences for fusions.
+3. **Run NetMHCpan-4.0 and NetMHC-4.0**. using default parameters for each algorithm.
 4. **Post-processing**. compiles predictions from both algorithms and finds strongest binder for each non-synonymous mutation. Also, each predicted neopeptide is searched against the entire reference peptidome to make sure it is a true neopeptide. `is_in_wt_peptidome` column reflects that. TODO: Incorporate neoantigen quality from [Łuksza et al., Nature 2017](https://www.nature.com/articles/nature24473)
 
-
 ## Install
 
 Clone the repo and install any necessary Python libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 3, not Python 2.x:
@@ -21,9 +19,10 @@ cd neoantigen-dev
 pip install -r requirements.txt
 ```
 
-
 ## Usage
-NOTE: For POLYSOLVER step, the pipeline requires 8 cores.
+
+NOTE: For POLYSOLVER step, the pipeline requires 8 cores.
+
 ```
 # Neoantigen prediction pipeline. Four main steps:
 #     (1) Genotype HLA using POLYSOLVER,
@@ -66,44 +65,48 @@ Optional arguments:
   --force_rerun_netmhc ignores any existing netMHCpan output and re-runs it.
Default: false
 ```
+
 ## Output
 
 ### HLA genotypes
+
 ```
 /polysolver/winners.hla.txt
 ```
 
 ### Neoantigen binding affinities annotated MAF
+
 ```
-.neoantigens.maf. (peptide with the highest binding affinity is incorporated into the original .maf for each non-syn mutation)
+.neoantigens.maf. (peptide with the highest binding affinity is incorporated into the original .maf for each non-syn mutation)
 .all_neoantigen_predictions.txt: all the predictions made for all peptides by both the algorithms
 ```
+
 The following columns are appended to the input `.maf`.
 
-| Column Name | Description |
-| ------------- |:-------------|
-| neo_maf_identifier_key | a unique key that can be used to find other peptides predicted for the same mutation (in `.all_neoantigen_predictions.txt`) |
-| neo_best_icore_peptide | neopeptide sequence for the strongest binder |
-| neo_best_rank | binding rank for the strongest binder |
-| neo_best_binding_affinity | binding affinity for the strongest binder |
-| neo_best_binder_classification | binding classification for the strongest binder (`Non Binder`, `Strong Binder`, `Weak Binder`) |
-| neo_best_is_in_wt_peptidome | `TRUE`/`FALSE` indicating whether the strongest binder peptide is in the reference peptidome |
-| neo_best_algorithm | algorithm predicting the strongest binder |
-| neo_best_hla_allele | hla allele for the strongest binder |
-| neo_n_peptides_evaluated | total # of all peptides evaluated (unique icore peptides) |
-| neo_n_strong_binders | total # of strong binders |
-| neo_n_weak_binders | total # of weak binders |
+| Column Name                    | Description                                                                                                                  |
+| ------------------------------ | :--------------------------------------------------------------------------------------------------------------------------- |
+| neo_maf_identifier_key         | a unique key that can be used to find other peptides predicted for the same mutation (in `.all_neoantigen_predictions.txt`)  |
+| neo_best_icore_peptide         | neopeptide sequence for the strongest binder                                                                                 |
+| neo_best_rank                  | binding rank for the strongest binder                                                                                        |
+| neo_best_binding_affinity      | binding affinity for the strongest binder                                                                                    |
+| neo_best_binder_classification | binding classification for the strongest binder (`Non Binder`, `Strong Binder`, `Weak Binder`)                               |
+| neo_best_is_in_wt_peptidome    | `TRUE`/`FALSE` indicating whether the strongest binder peptide is in the reference peptidome                                 |
+| neo_best_algorithm             | algorithm predicting the strongest binder                                                                                    |
+| neo_best_hla_allele            | hla allele for the strongest binder                                                                                          |
+| neo_n_peptides_evaluated       | total # of all peptides evaluated (unique icore peptides)                                                                    |
+| neo_n_strong_binders           | total # of strong binders                                                                                                    |
+| neo_n_weak_binders             | total # of weak binders                                                                                                      |
 
 The column description for `.all_neoantigen_predictions.txt` can be found in: [http://www.cbs.dtu.dk/services/NetMHC/output.php](http://www.cbs.dtu.dk/services/NetMHC/output.php). Additional columns are:
 
-| Column Name | Description |
-| ------------- |:-------------|
-| binder_class | `Non Binder`, `Strong Binder` (`rank < 0.5 or affinity < 50`), `Weak Binder` (`rank < 2 or affinity < 500`) |
-| best_binder_for_icore_group | `TRUE`/`FALSE` indicating if the binding prediction is the strongest among all the HLA-alleles/algorithms for the given icore peptide. 
| -| is_in_wt_peptidome | if the peptide is present in any other protein in the entire peptidome | -| neo_maf_identifier_key | a unique key that can be used to find other peptides predicted for the same mutation (in `.neoantigens.maf`) | +| Column Name | Description | +| --------------------------- | :------------------------------------------------------------------------------------------------------------------------------------- | +| binder_class | `Non Binder`, `Strong Binder` (`rank < 0.5 or affinity < 50`), `Weak Binder` (`rank < 2 or affinity < 500`) | +| best_binder_for_icore_group | `TRUE`/`FALSE` indicating if the binding prediction is the strongest among all the HLA-alleles/algorithms for the given icore peptide. | +| is_in_wt_peptidome | if the peptide is present in any other protein in the entire peptidome | +| neo_maf_identifier_key | a unique key that can be used to find other peptides predicted for the same mutation (in `.neoantigens.maf`) | ## Example @@ -114,4 +117,3 @@ python neoantigen.py --config_file neoantigen-luna.config \ --output_dir \ --maf_file ``` - From 36396e2595566542d338d4f832702b91f204a2ca Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Fri, 14 Jun 2019 16:06:47 -0400 Subject: [PATCH 3/9] Update the travis file --- .travis.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0d381cb..2e01ed6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,13 @@ language: python +dist: xenial # required for Python >= 3.7 python: - - 2.7 + - "3.3" + - "3.4" + - "3.5" + - "3.6" + - "3.7" + # PyPy versions + - "pypy3.5" install: - sudo apt-get -y update - pip install codecov From 0a6fa228db9094bfa283abb5a07d6428af7fe519 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Fri, 14 Jun 2019 17:41:36 -0400 Subject: [PATCH 4/9] Tighten the Python version requirements for travis --- .travis.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2e01ed6..498c94b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,9 @@ language: python dist: xenial # required for Python >= 3.7 python: - - "3.3" - - "3.4" - "3.5" - "3.6" - "3.7" - # PyPy versions - - "pypy3.5" install: - sudo apt-get -y update - pip install codecov From afe7436a84c7321364f43b2ea780949436a8d2ff Mon Sep 17 00:00:00 2001 From: evanbiederstedt Date: Fri, 14 Jun 2019 22:16:11 -0400 Subject: [PATCH 5/9] python2.7 added to travis config --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 498c94b..e4bb100 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ language: python dist: xenial # required for Python >= 3.7 python: + - "2.7" - "3.5" - "3.6" - "3.7" From 1c694b01aad43b525f9c3b0c7d3535a74cdd0bd2 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 11:16:39 -0400 Subject: [PATCH 6/9] Add compat for 2 and 3 --- neoantigen.py | 83 +++++++++++++++++++++++++----------------------- requirements.txt | 3 +- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index 7ba54ab..668bda3 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -8,15 +8,18 @@ import logging, logging.handlers import gzip import copy -from configparser import ConfigParser from joblib import Parallel, delayed +# support for Python 2 and 3 +from __future__ import print_function +from six.moves.configparser import ConfigParser + ##### -# Neoantigen prediction pipeline. 
Four main steps: -# (1) Genotype HLA using POLYSOLVER, -# (2) Constructed mutated peptide sequences from HGVSp/HGVSc +# Neoantigen prediction pipeline. Four main steps: +# (1) Genotype HLA using POLYSOLVER, +# (2) Constructed mutated peptide sequences from HGVSp/HGVSc # (3) Run NetMHC-4.0 + NetMHCpan-4.0 -# (4) Gather results and generate: +# (4) Gather results and generate: # - .neoantigens.maf: original maf with neoantigen prediction columns added # - .all_neoantigen_predictions.txt: all the predictions made for all peptides by both the algorithms ##### @@ -57,12 +60,12 @@ def main(): required=False, help='full path to normal bam file. Either --normal_bam or --hla_file are required.') - optional_arguments = parser.add_argument_group('Optional arguments') + optional_arguments = parser.add_argument_group('Optional arguments') optional_arguments.add_argument('--hla_file', required=False, help='POLYSOLVER output file (winners.hla.txt) for the sample. If not provided,' 'POLYSOLVER is run. Either --normal_bam or --hla_file are required.') - optional_arguments.add_argument('--peptide_lengths', + optional_arguments.add_argument('--peptide_lengths', required=False, help='comma-separated numbers indicating the lengths of peptides to generate. Default: 9,10') optional_arguments.add_argument('--keep_tmp_files', @@ -77,7 +80,7 @@ def main(): optional_arguments.add_argument('--force_rerun_netmhc', required=False, help='ignores any existing netMHCpan output and re-runs it. Default: false', - action='store_true') + action='store_true') args = parser.parse_args() @@ -87,7 +90,7 @@ def main(): output_dir = str(args.output_dir) hla_file = str(args.hla_file) sample_id = str(args.sample_id) - + peptide_lengths = [9, 10] if args.peptide_lengths is not None: peptide_lengths = list(map(int, str(args.peptide_lengths).split(','))) @@ -234,7 +237,7 @@ def main(): mutations = [] out_fa = open(mutated_sequences_fa, 'w') - ## generate .debug.fa for debugging purposes. + ## generate .debug.fa for debugging purposes. 
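    ## (each debug record, per the write calls below, pairs the mutated peptide with
    ## the full WT/MT CDS and translated AA sequences, so an unexpected prediction
    ## can be traced back to its mutated transcript by hand)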
debug_out_fa = open(sample_path_pfx + '.mutated_sequences.debug.fa', 'w') try: @@ -242,14 +245,14 @@ def main(): cds_seqs = load_transcript_fasta(reference_cds_file) cdna_seqs = load_transcript_fasta(reference_cdna_file) logger.info('Finished loading reference CDS/cDNA sequences...') - + logger.info('Reading MAF file and constructing mutated peptides...') maf_df = pd.read_table(maf_file, comment='#', low_memory=False, header=0) n_muts = n_non_syn_muts = n_missing_tx_id = 0 for index, row in maf_df.iterrows(): cds_seq = '' cdna_seq = '' - + n_muts += 1 tx_id = row['Transcript_ID'] if tx_id in cds_seqs: @@ -265,14 +268,14 @@ def main(): if cds_seq == '': n_missing_tx_id += 1 - + if cds_seq != '' and mut.is_non_syn(): - mut.generate_translated_sequences(max(peptide_lengths)) + mut.generate_translated_sequences(max(peptide_lengths)) if len(mut.mt_altered_aa) > 5: out_fa.write('>' + mut.identifier_key + '_mut\n') out_fa.write(mut.mt_altered_aa + '\n') - + ### write out WT/MT CDS + AA for debugging purposes debug_out_fa.write('>' + mut.identifier_key + '_mut\n') debug_out_fa.write('mt_altered_aa: ' + mut.mt_altered_aa + '\n') @@ -280,7 +283,7 @@ def main(): debug_out_fa.write('wt_full_aa: ' + mut.wt_aa + '\n') debug_out_fa.write('mt_full_cds: ' + mut.mt_cds + '\n') debug_out_fa.write('mt_full_aa: ' + mut.mt_aa + '\n') - + mutations.append(mut) out_fa.close() @@ -290,7 +293,7 @@ def main(): logger.info('\t\t# mutations: ' + str(n_muts)) logger.info('\t\t# non-syn: ' + str(n_non_syn_muts) + ' (# with missing CDS: ' + str(n_missing_tx_id) + ')') - # create empty neoantigens.maf file if there are no mutations + # create empty neoantigens.maf file if there are no mutations if n_muts == 0: logger.info('No mutations in this tumor. Creating empty .neoantigens.maf file') logger.info('Exiting neoantigen pipeline') @@ -308,7 +311,7 @@ def main(): try: logger.info('') logger.info('Running MHC--peptide binding predictions using NetMHCpan 4.0...') - + netmhcpan_output_pfx = sample_path_pfx + '.netmhcpan4.output' if not force_netmhc and check_file_exists_and_not_empty(netmhcpan_output_pfx + '.txt'): logger.info('Previous run of NetMHCpan-4.0 found... Skipping!') @@ -370,7 +373,7 @@ def main(): logger.info('') logger.info('Parse NetMHC-4.0 and NetMHCpan-4.0 output....') ### - ### Parse NetMHCpan and NetMHC output into a single file with binding scores + ### Parse NetMHCpan and NetMHC output into a single file with binding scores ### parse_output_cmd = "grep -P \"^\\s*\\d+\\s*HLA\\-\" | sed -r \'s/\\s+/\\t/g\' | sed -r \'s/^\\s*//g\' | cut -f 2-4,10,12-14 | " combined_output = sample_path_pfx + '.netmhcpan_netmhc_combined.output.txt' @@ -394,7 +397,7 @@ def main(): np_df['hla_allele'] = np_df['hla_allele'].map(lambda a: reformat_hla_allele(a)) ## - ## This determination of Strong/Weak binder is based on Swanton's PMID: 30894752. + ## This determination of Strong/Weak binder is based on Swanton's PMID: 30894752. ## np_df['binder_class'] = 'non binder' ## keep the casing for 'non' as is; for sorting purpose later np_df.loc[(np_df['affinity'] < 500) | (np_df['rank'] < 2), 'binder_class'] = 'Weak Binder' @@ -402,27 +405,27 @@ def main(): ## ## for each 'peptide', multiple binding predictions will be generated for different HLA alleles and by - ## different algorithms. 'best_binder_for_icore_group' flag (True/False) represents the best binding - ## prediction across the different alleles/algorithms for a given icore. Here, we are sorting by the + ## different algorithms. 
'best_binder_for_icore_group' flag (True/False) represents the best binding + ## prediction across the different alleles/algorithms for a given icore. Here, we are sorting by the ## columns below to select the one with best binding prediction and only the top row is retained. ## - np_by_peptide_df = np_df.sort_values(['binder_class', 'rank', 'affinity'], + np_by_peptide_df = np_df.sort_values(['binder_class', 'rank', 'affinity'], ascending=[True, True, True]).groupby('icore').first().reset_index() np_by_peptide_df['best_binder_for_icore_group'] = True ## annotate np_df with the 'best_binder_for_icore_group' - np_annotated_df = pd.merge(np_df, np_by_peptide_df, how='left', - on=['algorithm', 'hla_allele', 'peptide', 'core', + np_annotated_df = pd.merge(np_df, np_by_peptide_df, how='left', + on=['algorithm', 'hla_allele', 'peptide', 'core', 'icore', 'score', 'affinity', 'rank', 'binder_class']) np_annotated_df['best_binder_for_icore_group'] = np_annotated_df['best_binder_for_icore_group'].fillna(False) np_annotated_df.loc[(np_annotated_df['binder_class'] == 'non binder'), 'binder_class'] = 'Non Binder' - # For each neopeptide, we want to check whether that peptide fragment could be generated by any other WT protein + # For each neopeptide, we want to check whether that peptide fragment could be generated by any other WT protein # in the genome. Currently, optimal approach is to generate a string of the entire coding sequence in the genome # and search each neopeptide against it logger.info('Checking if the icore-peptide can be generated from WT sequence from the entire peptidome...') - + ref_aa_str = '' for cds in list(cds_seqs.values()): if cds[0:3] == 'ATG': @@ -442,8 +445,8 @@ def main(): # annotate the neopeptide dataframe with 'is_in_wt_peptidome' -- that will be written to output file np_annotated_df = pd.merge(np_annotated_df, icore_in_reference, how='left', on=['icore']) np_annotated_df['is_in_wt_peptidome'] = np_annotated_df['is_in_wt_peptidome'].fillna(False) - - # make a neopeptide object of each row. + + # make a neopeptide object of each row. all_neopeptides = [neopeptide(row) for index, row in np_annotated_df.iterrows()] # parse the mutations (with neopeptides/binding predictions) and compile output files. 
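        # (per match_with_neopeptides further down, a prediction is kept for a mutation
        # only when its icore lies inside that mutation's altered AA sequence and not
        # inside the WT protein; one summary row per mutation then goes into
        # .neoantigens.maf and every retained prediction into .all_neoantigen_predictions.txt)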
@@ -455,13 +458,13 @@ def main(): mut.match_with_neopeptides(all_neopeptides) maf_output.append(mut.get_maf_row_to_print()) predictions_output.extend(mut.get_predictions_rows_to_print()) - + maf_output_df = pd.DataFrame.from_items([(s.name, s) for s in maf_output]).T maf_output_df.to_csv(sample_path_pfx + '.neoantigens.maf' , sep='\t', index=False) predictions_output_df = pd.DataFrame.from_items([(s.name, s) for s in predictions_output]).T predictions_output_df.to_csv(sample_path_pfx + '.all_neoantigen_predictions.txt', sep='\t', index=False) - + except Exception: logger.error('Error while processing NetMHCpan and NetMHC output') logger.error(traceback.format_exc()) @@ -556,7 +559,7 @@ def __init__(self, row): self.binder_class = row['binder_class'] self.best_binder_for_icore_group = row['best_binder_for_icore_group'] self.is_in_wt_peptidome = row['is_in_wt_peptidome'] - + def is_strong_binder(self): if self.binder_class == 'Strong Binder': return True @@ -597,7 +600,7 @@ def get_best_binder(self): return sorted(self.get_best_per_icore(), key=lambda x: x.rank, reverse=False)[0] # -# mutation class holds each row in the maf and has +# mutation class holds each row in the maf and has # class mutation(object): maf_row = None @@ -623,10 +626,10 @@ def __init__(self, maf_row, cds_seq, cdna_seq): str(self.maf_row['End_Position']) + '_' + \ self.maf_row['Reference_Allele'] + '_' + \ self.maf_row['Tumor_Seq_Allele2'] - + ### Check if the variant_classification is among those that can generate a neoantigen def is_non_syn(self): - types = ['Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', + types = ['Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation', 'Nonstop_Mutation'] return self.maf_row['Variant_Classification'] in types and not pd.isnull(self.maf_row['HGVSp_Short']) @@ -668,7 +671,7 @@ def cds_to_aa(cds): def generate_translated_sequences(self, pad_len=10): if not self.is_non_syn(): return None - + ## append the 3'UTR to the CDS -- to account for non stop mutations and indels that shift the canonical stop if not self.cds_seq in self.cdna_seq: print('Skipping because the CDS is not contained within cDNA. Note: only 2 transcripts/peptides are like this') @@ -711,7 +714,7 @@ def generate_translated_sequences(self, pad_len=10): wt = mutation.cds_to_aa(self.wt_cds) mt = mutation.cds_to_aa(self.mt_cds) - ### identify regions of mutation in WT and MT sequences. + ### identify regions of mutation in WT and MT sequences. ### logic is to match the wt and mt sequences first from the beginning until a mismatch is found; and, then, ### start from the end of both sequences until a mismatch is found. the intervening sequence represents the WT and MT sequences ### Note, aside from missenses, the interpretation of WT sequence is ambiguous. 
@@ -751,8 +754,8 @@ def match_with_neopeptides(self, all_neopeptides): # make sure the neopeptide is not a peptide fragment of the wild-type protein if np.icore in self.mt_altered_aa and np.icore not in self.wt_aa: self.predicted_neopeptides.add_neopeptide(copy.deepcopy(np)) - - # simply prints the original row in the MAF file along with some neoantigen prediction specific + + # simply prints the original row in the MAF file along with some neoantigen prediction specific # appended at the end def get_maf_row_to_print(self): row = self.maf_row @@ -792,7 +795,7 @@ def get_predictions_rows_to_print(self): for prediction in self.predicted_neopeptides.neopeptides: prediction.row['neo_maf_identifier_key'] = self.identifier_key rows.append(prediction.row) - return rows + return rows if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index 5503bf3..39e32be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ psutil(>=5.6.2) python-dateutil(>=2.8.0) pandas(>=0.24.2) -joblib(>=0.13.2) \ No newline at end of file +joblib(>=0.13.2) +six From 0c4ca7b48b1dd07d60fce7e252f79a5167f3227c Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 11:17:56 -0400 Subject: [PATCH 7/9] print_function must be at beginning of file --- neoantigen.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index 668bda3..ce078dd 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python2 +from __future__ import print_function +from six.moves.configparser import ConfigParser import os, sys, subprocess, psutil import argparse, re, errno @@ -10,9 +11,6 @@ import copy from joblib import Parallel, delayed -# support for Python 2 and 3 -from __future__ import print_function -from six.moves.configparser import ConfigParser ##### # Neoantigen prediction pipeline. Four main steps: From f4cd9e2b40b6468a871734f4d289fd8fdd89d19f Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 14:57:29 -0400 Subject: [PATCH 8/9] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b2121d..89da658 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The pipeline has four main steps: ## Install -Clone the repo and install any necessary Python libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 3, not Python 2.x: +Clone the repo and install any necessary Python libraries from `requirements.txt`. This repo is currently compatible with Python 2 and 3. To install: ```bash git clone https://github.com/taylor-lab/neoantigen-dev.git From c34d7c58dbe884fb5a824ef77464e45ff97a8a8b Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 15:47:58 -0400 Subject: [PATCH 9/9] Have to decode text into strings to do string ops --- neoantigen.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index ce078dd..9cb598a 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -11,7 +11,6 @@ import copy from joblib import Parallel, delayed - ##### # Neoantigen prediction pipeline. 
Four main steps: # (1) Genotype HLA using POLYSOLVER, @@ -220,7 +219,7 @@ def main(): ## parse hla-alleles into the format that is required by NetMHC if os.path.isfile(os.path.abspath(hla_file)): - for allele in re.split('\n|\t', subprocess.check_output('cut -f 2-3 ' + hla_file, shell=True)): + for allele in re.split('\n|\t', subprocess.check_output('cut -f 2-3 ' + hla_file, shell=True).decode('utf-8')): if allele == '': continue levels = allele.split('_') @@ -245,7 +244,7 @@ def main(): logger.info('Finished loading reference CDS/cDNA sequences...') logger.info('Reading MAF file and constructing mutated peptides...') - maf_df = pd.read_table(maf_file, comment='#', low_memory=False, header=0) + maf_df = pd.read_csv(maf_file, comment='#', low_memory=False, header=0, sep="\t") n_muts = n_non_syn_muts = n_missing_tx_id = 0 for index, row in maf_df.iterrows(): cds_seq = '' @@ -511,7 +510,7 @@ def load_transcript_fasta(fa_file): idx = 0 while idx < len(lines): line = lines[idx] - m = re.search('^>(ENST\d+)\s', line) + m = re.search('^>(ENST\d+)\s', line.decode('utf-8')) transcript_id = '' if not m: sys.exit('Error parsing transcript file ' + fa_file + ' at line: ' + line) @@ -520,8 +519,8 @@ def load_transcript_fasta(fa_file): idx = idx + 1 seq_str = '' - while idx < len(lines) and not re.match('^>ENST', lines[idx]): - seq_str = seq_str + lines[idx].strip() + while idx < len(lines) and not re.match('^>ENST', lines[idx].decode('utf-8')): + seq_str = seq_str + lines[idx].decode('utf-8').strip() idx = idx + 1 seqs[transcript_id] = seq_str
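For reference, the bytes-versus-str behavior this last patch works around can be reproduced in isolation. A minimal sketch under Python 3 -- the file name and echoed value below are placeholders, not the pipeline's real inputs:

```python
import gzip
import re
import subprocess

# subprocess.check_output() returns bytes under Python 3, and str-pattern
# regex operations on bytes raise a TypeError, hence the .decode('utf-8').
raw = subprocess.check_output('echo hla_a_01_01', shell=True)  # b'hla_a_01_01\n'
fields = re.split('\n|\t', raw.decode('utf-8'))                # ['hla_a_01_01', '']

# gzip files opened with mode 'r' also yield bytes lines, so each line must
# be decoded before str operations like re.match() or strip() can be used.
with gzip.open('transcripts.fa.gz', 'w') as out:               # placeholder file
    out.write(b'>ENST00000000001 test\nATGGCT\n')

with gzip.open('transcripts.fa.gz', 'r') as fh:
    for line in fh:
        line = line.decode('utf-8')
        if re.match('^>ENST', line):
            print(line.strip())                                # '>ENST00000000001 test'
```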