From 9080339689da4df9af4dda09cbccdbe0dc9cc8cb Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Fri, 14 Jun 2019 16:02:28 -0400 Subject: [PATCH 1/9] Run 2to3 and update README --- README.md | 2 +- neoantigen.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 657a34a..9f3f556 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The pipeline has four main steps: ## Install -Clone the repo and install any necessary python2 libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 2.7, not Python 3.x : +Clone the repo and install any necessary Python libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 3, not Python 2.x: ```bash git clone https://github.com/taylor-lab/neoantigen-dev.git diff --git a/neoantigen.py b/neoantigen.py index f179825..7ba54ab 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -8,7 +8,7 @@ import logging, logging.handlers import gzip import copy -from ConfigParser import ConfigParser +from configparser import ConfigParser from joblib import Parallel, delayed ##### @@ -90,7 +90,7 @@ def main(): peptide_lengths = [9, 10] if args.peptide_lengths is not None: - peptide_lengths = map(int, str(args.peptide_lengths).split(',')) + peptide_lengths = list(map(int, str(args.peptide_lengths).split(','))) if args.config_file is None: config_file_path = os.path.dirname(os.path.realpath(__file__)) + '/neoantigen-luna.config' @@ -98,7 +98,7 @@ def main(): config_file_path = str(args.config_file) if not os.path.exists(config_file_path): - print 'Error: could not open config file: ' + config_file_path + '. Exiting.' + print('Error: could not open config file: ' + config_file_path + '. Exiting.') exit(1) keep_tmp_files = False @@ -115,19 +115,19 @@ def main(): force_netmhc = True if args.normal_bam is None and args.hla_file is None: - print >> sys.stderr, 'Error: --normal_bam or --hla_file is required. Exiting.' + print('Error: --normal_bam or --hla_file is required. Exiting.', file=sys.stderr) exit(1) if args.normal_bam is not None and not os.path.exists(normal_bamfile): - print >> sys.stderr, 'Error: --normal_bam '' + normal_bamfile + '' does not exist. Exiting.' + print('Error: --normal_bam '' + normal_bamfile + '' does not exist. Exiting.', file=sys.stderr) exit(1) if args.hla_file is not None and not os.path.exists(hla_file): - print >> sys.stderr, 'Error: --hla_file '' + hla_file + '' does not exist. Exiting.' + print('Error: --hla_file '' + hla_file + '' does not exist. Exiting.', file=sys.stderr) exit(1) if not os.path.exists(maf_file): - print >> sys.stderr, 'Error: --maf_file '' + maf_file + '' does not exist. Exiting.' + print('Error: --maf_file '' + maf_file + '' does not exist. Exiting.', file=sys.stderr) exit(1) os.system('mkdir -p ' + output_dir) @@ -424,18 +424,18 @@ def main(): logger.info('Checking if the icore-peptide can be generated from WT sequence from the entire peptidome...') ref_aa_str = '' - for cds in cds_seqs.values(): + for cds in list(cds_seqs.values()): if cds[0:3] == 'ATG': ref_aa_str += mutation.cds_to_aa(cds) + '|' # make a list of all unique peptides - all_peptides = ({row['icore']:1 for index, row in np_df.iterrows()}).keys() + all_peptides = list(({row['icore']:1 for index, row in np_df.iterrows()}).keys()) # parallelize and search each icore peptide against the reference peptidome. Note: deliberately hard-coded 4 cores for now. 
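        # (a hedged reading of the fan-out below: each worker gets the full peptide
        # list plus its own 1-based index and the worker count, so find_in_reference_peptides
        # presumably scans its own quarter of the peptides against ref_aa_str and
        # returns the matching icores; Parallel collects one hit-list per worker)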
        results = Parallel(n_jobs=4)(delayed(find_in_reference_peptides)(all_peptides, i, 4, ref_aa_str) for i in range(1, 5))

         # construct a dataframe of the peptides that are found in other protein coding genes
-        icore_in_reference = pd.DataFrame(data={item:1 for sublist in results for item in sublist}.keys(), columns=['icore'])
+        icore_in_reference = pd.DataFrame(data=list({item:1 for sublist in results for item in sublist}.keys()), columns=['icore'])
         icore_in_reference['is_in_wt_peptidome'] = True
 
         logger.info('...completed!')
@@ -671,7 +671,7 @@ def generate_translated_sequences(self, pad_len=10):
 
         ## append the 3'UTR to the CDS -- to account for non stop mutations and indels that shift the canonical stop
         if not self.cds_seq in self.cdna_seq:
-            print 'Skipping because the CDS is not contained within cDNA. Note: only 2 transcripts/peptides are like this'
+            print('Skipping because the CDS is not contained within cDNA. Note: only 2 transcripts/peptides are like this')
             return None
 
         hgvsc = self.maf_row['HGVSc']

From cee30e47721d1e2e407c6a38c13703b70bad4dd3 Mon Sep 17 00:00:00 2001
From: Benjamin Lee
Date: Fri, 14 Jun 2019 16:03:17 -0400
Subject: [PATCH 2/9] Prettify the README

---
 README.md | 60 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 9f3f556..3b2121d 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,14 @@
-
-
 # neoantigen-dev
+
 MHC Class I neoantigen prediction pipeline from IM5/IM6/WES/WGS. Takes Normal `.bam` and somatic `.maf` and generates neoantigen predictions for HLA-A/B/C.
 
 The pipeline has four main steps:
 
-1. **Genotype HLA**. genotyping performed using POLYSOLVER.
-2. **Construct mutated peptides**. For non-synonymous mutations, generates mutated peptide sequences based on `HGVSc`. _NOTE_: `.maf` file should be VEP annotated using `cmo_maf2maf --version 1.6.14 --vep-release 88` **using this EXACT VERSION**. TODO: Generate mutated sequences for fusions.
-3. **Run NetMHCpan-4.0 and NetMHC-4.0**. using default parameters for each algorithm.
+1. **Genotype HLA**. genotyping performed using POLYSOLVER.
+2. **Construct mutated peptides**. For non-synonymous mutations, generates mutated peptide sequences based on `HGVSc`. _NOTE_: `.maf` file should be VEP annotated using `cmo_maf2maf --version 1.6.14 --vep-release 88` **using this EXACT VERSION**. TODO: Generate mutated sequences for fusions.
+3. **Run NetMHCpan-4.0 and NetMHC-4.0**. using default parameters for each algorithm.
 4. **Post-processing**. compiles predictions from both algorithms and finds strongest binder for each non-synonymous mutation. Also, each predicted neopeptide is searched against the entire reference peptidome to make sure it is a true neopeptide. `is_in_wt_peptidome` column reflects that. TODO: Incorporate neoantigen quality from [Łuksza et al., Nature 2017](https://www.nature.com/articles/nature24473)
 
-
 ## Install
 
 Clone the repo and install any necessary Python libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 3, not Python 2.x:
@@ -21,9 +19,10 @@ cd neoantigen-dev
 pip install -r requirements.txt
 ```
 
-
 ## Usage
-NOTE: For POLYSOLVER step, the pipeline requires 8 cores.
+
+NOTE: For POLYSOLVER step, the pipeline requires 8 cores.
+
 ```
 # Neoantigen prediction pipeline. Four main steps:
 #     (1) Genotype HLA using POLYSOLVER,
@@ -66,44 +65,48 @@ Optional arguments:
   --force_rerun_netmhc ignores any existing netMHCpan output and re-runs it.
Default: false
 ```
+
 ## Output
 
 ### HLA genotypes
+
 ```
 /polysolver/winners.hla.txt
 ```
 
 ### Neoantigen binding affinities annotated MAF
+
 ```
-.neoantigens.maf. (peptide with the highest binding affinity is incorporated into the original .maf for each non-syn mutation)
+.neoantigens.maf. (peptide with the highest binding affinity is incorporated into the original .maf for each non-syn mutation)
 .all_neoantigen_predictions.txt: all the predictions made for all peptides by both the algorithms
 ```
+
 The following columns are appended to the input `.maf`.
 
-| Column Name | Description |
-| ------------- |:-------------|
-| neo_maf_identifier_key | a unique key that can be used to find other peptides predicted for the same mutation (in `.all_neoantigen_predictions.txt`) |
-| neo_best_icore_peptide | neopeptide sequence for the strongest binder |
-| neo_best_rank | binding rank for the strongest binder |
-| neo_best_binding_affinity | binding affinity for the strongest binder |
-| neo_best_binder_classification | binding classification for the strongest binder (`Non Binder`, `Strong Binder`, `Weak Binder`) |
-| neo_best_is_in_wt_peptidome | `TRUE`/`FALSE` indicating whether the strongest binder peptide is in the reference peptidome |
-| neo_best_algorithm | algorithm predicting the strongest binder |
-| neo_best_hla_allele | hla allele for the strongest binder |
-| neo_n_peptides_evaluated | total # of all peptides evaluated (unique icore peptides) |
-| neo_n_strong_binders | total # of strong binders |
-| neo_n_weak_binders | total # of weak binders |
+| Column Name                    | Description                                                                                                                  |
+| ------------------------------ | :--------------------------------------------------------------------------------------------------------------------------- |
+| neo_maf_identifier_key         | a unique key that can be used to find other peptides predicted for the same mutation (in `.all_neoantigen_predictions.txt`)  |
+| neo_best_icore_peptide         | neopeptide sequence for the strongest binder                                                                                 |
+| neo_best_rank                  | binding rank for the strongest binder                                                                                        |
+| neo_best_binding_affinity      | binding affinity for the strongest binder                                                                                    |
+| neo_best_binder_classification | binding classification for the strongest binder (`Non Binder`, `Strong Binder`, `Weak Binder`)                               |
+| neo_best_is_in_wt_peptidome    | `TRUE`/`FALSE` indicating whether the strongest binder peptide is in the reference peptidome                                 |
+| neo_best_algorithm             | algorithm predicting the strongest binder                                                                                    |
+| neo_best_hla_allele            | hla allele for the strongest binder                                                                                          |
+| neo_n_peptides_evaluated       | total # of all peptides evaluated (unique icore peptides)                                                                    |
+| neo_n_strong_binders           | total # of strong binders                                                                                                    |
+| neo_n_weak_binders             | total # of weak binders                                                                                                      |
 
 The column description for `.all_neoantigen_predictions.txt` can be found in: [http://www.cbs.dtu.dk/services/NetMHC/output.php](http://www.cbs.dtu.dk/services/NetMHC/output.php). Additional columns are:
 
-| Column Name | Description |
-| ------------- |:-------------|
-| binder_class | `Non Binder`, `Strong Binder` (`rank < 0.5 or affinity < 50`), `Weak Binder` (`rank < 2 or affinity < 500`) |
-| best_binder_for_icore_group | `TRUE`/`FALSE` indicating if the binding prediction is the strongest among all the HLA-alleles/algorithms for the given icore peptide. 
| -| is_in_wt_peptidome | if the peptide is present in any other protein in the entire peptidome | -| neo_maf_identifier_key | a unique key that can be used to find other peptides predicted for the same mutation (in `.neoantigens.maf`) | +| Column Name | Description | +| --------------------------- | :------------------------------------------------------------------------------------------------------------------------------------- | +| binder_class | `Non Binder`, `Strong Binder` (`rank < 0.5 or affinity < 50`), `Weak Binder` (`rank < 2 or affinity < 500`) | +| best_binder_for_icore_group | `TRUE`/`FALSE` indicating if the binding prediction is the strongest among all the HLA-alleles/algorithms for the given icore peptide. | +| is_in_wt_peptidome | if the peptide is present in any other protein in the entire peptidome | +| neo_maf_identifier_key | a unique key that can be used to find other peptides predicted for the same mutation (in `.neoantigens.maf`) | ## Example @@ -114,4 +117,3 @@ python neoantigen.py --config_file neoantigen-luna.config \ --output_dir \ --maf_file ``` - From 36396e2595566542d338d4f832702b91f204a2ca Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Fri, 14 Jun 2019 16:06:47 -0400 Subject: [PATCH 3/9] Update the travis file --- .travis.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0d381cb..2e01ed6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,13 @@ language: python +dist: xenial # required for Python >= 3.7 python: - - 2.7 + - "3.3" + - "3.4" + - "3.5" + - "3.6" + - "3.7" + # PyPy versions + - "pypy3.5" install: - sudo apt-get -y update - pip install codecov From 0a6fa228db9094bfa283abb5a07d6428af7fe519 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Fri, 14 Jun 2019 17:41:36 -0400 Subject: [PATCH 4/9] Tighten the Python version requirements for travis --- .travis.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2e01ed6..498c94b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,9 @@ language: python dist: xenial # required for Python >= 3.7 python: - - "3.3" - - "3.4" - "3.5" - "3.6" - "3.7" - # PyPy versions - - "pypy3.5" install: - sudo apt-get -y update - pip install codecov From afe7436a84c7321364f43b2ea780949436a8d2ff Mon Sep 17 00:00:00 2001 From: evanbiederstedt Date: Fri, 14 Jun 2019 22:16:11 -0400 Subject: [PATCH 5/9] python2.7 added to travis config --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 498c94b..e4bb100 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ language: python dist: xenial # required for Python >= 3.7 python: + - "2.7" - "3.5" - "3.6" - "3.7" From 1c694b01aad43b525f9c3b0c7d3535a74cdd0bd2 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 11:16:39 -0400 Subject: [PATCH 6/9] Add compat for 2 and 3 --- neoantigen.py | 83 +++++++++++++++++++++++++----------------------- requirements.txt | 3 +- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index 7ba54ab..668bda3 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -8,15 +8,18 @@ import logging, logging.handlers import gzip import copy -from configparser import ConfigParser from joblib import Parallel, delayed +# support for Python 2 and 3 +from __future__ import print_function +from six.moves.configparser import ConfigParser + ##### -# Neoantigen prediction pipeline. 
Four main steps: -# (1) Genotype HLA using POLYSOLVER, -# (2) Constructed mutated peptide sequences from HGVSp/HGVSc +# Neoantigen prediction pipeline. Four main steps: +# (1) Genotype HLA using POLYSOLVER, +# (2) Constructed mutated peptide sequences from HGVSp/HGVSc # (3) Run NetMHC-4.0 + NetMHCpan-4.0 -# (4) Gather results and generate: +# (4) Gather results and generate: # - .neoantigens.maf: original maf with neoantigen prediction columns added # - .all_neoantigen_predictions.txt: all the predictions made for all peptides by both the algorithms ##### @@ -57,12 +60,12 @@ def main(): required=False, help='full path to normal bam file. Either --normal_bam or --hla_file are required.') - optional_arguments = parser.add_argument_group('Optional arguments') + optional_arguments = parser.add_argument_group('Optional arguments') optional_arguments.add_argument('--hla_file', required=False, help='POLYSOLVER output file (winners.hla.txt) for the sample. If not provided,' 'POLYSOLVER is run. Either --normal_bam or --hla_file are required.') - optional_arguments.add_argument('--peptide_lengths', + optional_arguments.add_argument('--peptide_lengths', required=False, help='comma-separated numbers indicating the lengths of peptides to generate. Default: 9,10') optional_arguments.add_argument('--keep_tmp_files', @@ -77,7 +80,7 @@ def main(): optional_arguments.add_argument('--force_rerun_netmhc', required=False, help='ignores any existing netMHCpan output and re-runs it. Default: false', - action='store_true') + action='store_true') args = parser.parse_args() @@ -87,7 +90,7 @@ def main(): output_dir = str(args.output_dir) hla_file = str(args.hla_file) sample_id = str(args.sample_id) - + peptide_lengths = [9, 10] if args.peptide_lengths is not None: peptide_lengths = list(map(int, str(args.peptide_lengths).split(','))) @@ -234,7 +237,7 @@ def main(): mutations = [] out_fa = open(mutated_sequences_fa, 'w') - ## generate .debug.fa for debugging purposes. + ## generate .debug.fa for debugging purposes. 
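    ## (each debug record, per the write calls below, pairs the mutated peptide with
    ## the full WT/MT CDS and translated AA sequences, so an unexpected prediction
    ## can be traced back to its mutated transcript by hand)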
debug_out_fa = open(sample_path_pfx + '.mutated_sequences.debug.fa', 'w') try: @@ -242,14 +245,14 @@ def main(): cds_seqs = load_transcript_fasta(reference_cds_file) cdna_seqs = load_transcript_fasta(reference_cdna_file) logger.info('Finished loading reference CDS/cDNA sequences...') - + logger.info('Reading MAF file and constructing mutated peptides...') maf_df = pd.read_table(maf_file, comment='#', low_memory=False, header=0) n_muts = n_non_syn_muts = n_missing_tx_id = 0 for index, row in maf_df.iterrows(): cds_seq = '' cdna_seq = '' - + n_muts += 1 tx_id = row['Transcript_ID'] if tx_id in cds_seqs: @@ -265,14 +268,14 @@ def main(): if cds_seq == '': n_missing_tx_id += 1 - + if cds_seq != '' and mut.is_non_syn(): - mut.generate_translated_sequences(max(peptide_lengths)) + mut.generate_translated_sequences(max(peptide_lengths)) if len(mut.mt_altered_aa) > 5: out_fa.write('>' + mut.identifier_key + '_mut\n') out_fa.write(mut.mt_altered_aa + '\n') - + ### write out WT/MT CDS + AA for debugging purposes debug_out_fa.write('>' + mut.identifier_key + '_mut\n') debug_out_fa.write('mt_altered_aa: ' + mut.mt_altered_aa + '\n') @@ -280,7 +283,7 @@ def main(): debug_out_fa.write('wt_full_aa: ' + mut.wt_aa + '\n') debug_out_fa.write('mt_full_cds: ' + mut.mt_cds + '\n') debug_out_fa.write('mt_full_aa: ' + mut.mt_aa + '\n') - + mutations.append(mut) out_fa.close() @@ -290,7 +293,7 @@ def main(): logger.info('\t\t# mutations: ' + str(n_muts)) logger.info('\t\t# non-syn: ' + str(n_non_syn_muts) + ' (# with missing CDS: ' + str(n_missing_tx_id) + ')') - # create empty neoantigens.maf file if there are no mutations + # create empty neoantigens.maf file if there are no mutations if n_muts == 0: logger.info('No mutations in this tumor. Creating empty .neoantigens.maf file') logger.info('Exiting neoantigen pipeline') @@ -308,7 +311,7 @@ def main(): try: logger.info('') logger.info('Running MHC--peptide binding predictions using NetMHCpan 4.0...') - + netmhcpan_output_pfx = sample_path_pfx + '.netmhcpan4.output' if not force_netmhc and check_file_exists_and_not_empty(netmhcpan_output_pfx + '.txt'): logger.info('Previous run of NetMHCpan-4.0 found... Skipping!') @@ -370,7 +373,7 @@ def main(): logger.info('') logger.info('Parse NetMHC-4.0 and NetMHCpan-4.0 output....') ### - ### Parse NetMHCpan and NetMHC output into a single file with binding scores + ### Parse NetMHCpan and NetMHC output into a single file with binding scores ### parse_output_cmd = "grep -P \"^\\s*\\d+\\s*HLA\\-\" | sed -r \'s/\\s+/\\t/g\' | sed -r \'s/^\\s*//g\' | cut -f 2-4,10,12-14 | " combined_output = sample_path_pfx + '.netmhcpan_netmhc_combined.output.txt' @@ -394,7 +397,7 @@ def main(): np_df['hla_allele'] = np_df['hla_allele'].map(lambda a: reformat_hla_allele(a)) ## - ## This determination of Strong/Weak binder is based on Swanton's PMID: 30894752. + ## This determination of Strong/Weak binder is based on Swanton's PMID: 30894752. ## np_df['binder_class'] = 'non binder' ## keep the casing for 'non' as is; for sorting purpose later np_df.loc[(np_df['affinity'] < 500) | (np_df['rank'] < 2), 'binder_class'] = 'Weak Binder' @@ -402,27 +405,27 @@ def main(): ## ## for each 'peptide', multiple binding predictions will be generated for different HLA alleles and by - ## different algorithms. 'best_binder_for_icore_group' flag (True/False) represents the best binding - ## prediction across the different alleles/algorithms for a given icore. Here, we are sorting by the + ## different algorithms. 
'best_binder_for_icore_group' flag (True/False) represents the best binding + ## prediction across the different alleles/algorithms for a given icore. Here, we are sorting by the ## columns below to select the one with best binding prediction and only the top row is retained. ## - np_by_peptide_df = np_df.sort_values(['binder_class', 'rank', 'affinity'], + np_by_peptide_df = np_df.sort_values(['binder_class', 'rank', 'affinity'], ascending=[True, True, True]).groupby('icore').first().reset_index() np_by_peptide_df['best_binder_for_icore_group'] = True ## annotate np_df with the 'best_binder_for_icore_group' - np_annotated_df = pd.merge(np_df, np_by_peptide_df, how='left', - on=['algorithm', 'hla_allele', 'peptide', 'core', + np_annotated_df = pd.merge(np_df, np_by_peptide_df, how='left', + on=['algorithm', 'hla_allele', 'peptide', 'core', 'icore', 'score', 'affinity', 'rank', 'binder_class']) np_annotated_df['best_binder_for_icore_group'] = np_annotated_df['best_binder_for_icore_group'].fillna(False) np_annotated_df.loc[(np_annotated_df['binder_class'] == 'non binder'), 'binder_class'] = 'Non Binder' - # For each neopeptide, we want to check whether that peptide fragment could be generated by any other WT protein + # For each neopeptide, we want to check whether that peptide fragment could be generated by any other WT protein # in the genome. Currently, optimal approach is to generate a string of the entire coding sequence in the genome # and search each neopeptide against it logger.info('Checking if the icore-peptide can be generated from WT sequence from the entire peptidome...') - + ref_aa_str = '' for cds in list(cds_seqs.values()): if cds[0:3] == 'ATG': @@ -442,8 +445,8 @@ def main(): # annotate the neopeptide dataframe with 'is_in_wt_peptidome' -- that will be written to output file np_annotated_df = pd.merge(np_annotated_df, icore_in_reference, how='left', on=['icore']) np_annotated_df['is_in_wt_peptidome'] = np_annotated_df['is_in_wt_peptidome'].fillna(False) - - # make a neopeptide object of each row. + + # make a neopeptide object of each row. all_neopeptides = [neopeptide(row) for index, row in np_annotated_df.iterrows()] # parse the mutations (with neopeptides/binding predictions) and compile output files. 
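        # (per match_with_neopeptides further down, a prediction is kept for a mutation
        # only when its icore lies inside that mutation's altered AA sequence and not
        # inside the WT protein; one summary row per mutation then goes into
        # .neoantigens.maf and every retained prediction into .all_neoantigen_predictions.txt)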
@@ -455,13 +458,13 @@ def main(): mut.match_with_neopeptides(all_neopeptides) maf_output.append(mut.get_maf_row_to_print()) predictions_output.extend(mut.get_predictions_rows_to_print()) - + maf_output_df = pd.DataFrame.from_items([(s.name, s) for s in maf_output]).T maf_output_df.to_csv(sample_path_pfx + '.neoantigens.maf' , sep='\t', index=False) predictions_output_df = pd.DataFrame.from_items([(s.name, s) for s in predictions_output]).T predictions_output_df.to_csv(sample_path_pfx + '.all_neoantigen_predictions.txt', sep='\t', index=False) - + except Exception: logger.error('Error while processing NetMHCpan and NetMHC output') logger.error(traceback.format_exc()) @@ -556,7 +559,7 @@ def __init__(self, row): self.binder_class = row['binder_class'] self.best_binder_for_icore_group = row['best_binder_for_icore_group'] self.is_in_wt_peptidome = row['is_in_wt_peptidome'] - + def is_strong_binder(self): if self.binder_class == 'Strong Binder': return True @@ -597,7 +600,7 @@ def get_best_binder(self): return sorted(self.get_best_per_icore(), key=lambda x: x.rank, reverse=False)[0] # -# mutation class holds each row in the maf and has +# mutation class holds each row in the maf and has # class mutation(object): maf_row = None @@ -623,10 +626,10 @@ def __init__(self, maf_row, cds_seq, cdna_seq): str(self.maf_row['End_Position']) + '_' + \ self.maf_row['Reference_Allele'] + '_' + \ self.maf_row['Tumor_Seq_Allele2'] - + ### Check if the variant_classification is among those that can generate a neoantigen def is_non_syn(self): - types = ['Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', + types = ['Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation', 'Nonstop_Mutation'] return self.maf_row['Variant_Classification'] in types and not pd.isnull(self.maf_row['HGVSp_Short']) @@ -668,7 +671,7 @@ def cds_to_aa(cds): def generate_translated_sequences(self, pad_len=10): if not self.is_non_syn(): return None - + ## append the 3'UTR to the CDS -- to account for non stop mutations and indels that shift the canonical stop if not self.cds_seq in self.cdna_seq: print('Skipping because the CDS is not contained within cDNA. Note: only 2 transcripts/peptides are like this') @@ -711,7 +714,7 @@ def generate_translated_sequences(self, pad_len=10): wt = mutation.cds_to_aa(self.wt_cds) mt = mutation.cds_to_aa(self.mt_cds) - ### identify regions of mutation in WT and MT sequences. + ### identify regions of mutation in WT and MT sequences. ### logic is to match the wt and mt sequences first from the beginning until a mismatch is found; and, then, ### start from the end of both sequences until a mismatch is found. the intervening sequence represents the WT and MT sequences ### Note, aside from missenses, the interpretation of WT sequence is ambiguous. 
@@ -751,8 +754,8 @@ def match_with_neopeptides(self, all_neopeptides): # make sure the neopeptide is not a peptide fragment of the wild-type protein if np.icore in self.mt_altered_aa and np.icore not in self.wt_aa: self.predicted_neopeptides.add_neopeptide(copy.deepcopy(np)) - - # simply prints the original row in the MAF file along with some neoantigen prediction specific + + # simply prints the original row in the MAF file along with some neoantigen prediction specific # appended at the end def get_maf_row_to_print(self): row = self.maf_row @@ -792,7 +795,7 @@ def get_predictions_rows_to_print(self): for prediction in self.predicted_neopeptides.neopeptides: prediction.row['neo_maf_identifier_key'] = self.identifier_key rows.append(prediction.row) - return rows + return rows if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index 5503bf3..39e32be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ psutil(>=5.6.2) python-dateutil(>=2.8.0) pandas(>=0.24.2) -joblib(>=0.13.2) \ No newline at end of file +joblib(>=0.13.2) +six From 0c4ca7b48b1dd07d60fce7e252f79a5167f3227c Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 11:17:56 -0400 Subject: [PATCH 7/9] print_function must be at beginning of file --- neoantigen.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index 668bda3..ce078dd 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python2 +from __future__ import print_function +from six.moves.configparser import ConfigParser import os, sys, subprocess, psutil import argparse, re, errno @@ -10,9 +11,6 @@ import copy from joblib import Parallel, delayed -# support for Python 2 and 3 -from __future__ import print_function -from six.moves.configparser import ConfigParser ##### # Neoantigen prediction pipeline. Four main steps: From f4cd9e2b40b6468a871734f4d289fd8fdd89d19f Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 14:57:29 -0400 Subject: [PATCH 8/9] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b2121d..89da658 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The pipeline has four main steps: ## Install -Clone the repo and install any necessary Python libraries from `requirements.txt`. Note that this repo is currently only compatible with Python 3, not Python 2.x: +Clone the repo and install any necessary Python libraries from `requirements.txt`. This repo is currently compatible with Python 2 and 3. To install: ```bash git clone https://github.com/taylor-lab/neoantigen-dev.git From c34d7c58dbe884fb5a824ef77464e45ff97a8a8b Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Mon, 17 Jun 2019 15:47:58 -0400 Subject: [PATCH 9/9] Have to decode text into strings to do string ops --- neoantigen.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/neoantigen.py b/neoantigen.py index ce078dd..9cb598a 100644 --- a/neoantigen.py +++ b/neoantigen.py @@ -11,7 +11,6 @@ import copy from joblib import Parallel, delayed - ##### # Neoantigen prediction pipeline. 
Four main steps: # (1) Genotype HLA using POLYSOLVER, @@ -220,7 +219,7 @@ def main(): ## parse hla-alleles into the format that is required by NetMHC if os.path.isfile(os.path.abspath(hla_file)): - for allele in re.split('\n|\t', subprocess.check_output('cut -f 2-3 ' + hla_file, shell=True)): + for allele in re.split('\n|\t', subprocess.check_output('cut -f 2-3 ' + hla_file, shell=True).decode('utf-8')): if allele == '': continue levels = allele.split('_') @@ -245,7 +244,7 @@ def main(): logger.info('Finished loading reference CDS/cDNA sequences...') logger.info('Reading MAF file and constructing mutated peptides...') - maf_df = pd.read_table(maf_file, comment='#', low_memory=False, header=0) + maf_df = pd.read_csv(maf_file, comment='#', low_memory=False, header=0, sep="\t") n_muts = n_non_syn_muts = n_missing_tx_id = 0 for index, row in maf_df.iterrows(): cds_seq = '' @@ -511,7 +510,7 @@ def load_transcript_fasta(fa_file): idx = 0 while idx < len(lines): line = lines[idx] - m = re.search('^>(ENST\d+)\s', line) + m = re.search('^>(ENST\d+)\s', line.decode('utf-8')) transcript_id = '' if not m: sys.exit('Error parsing transcript file ' + fa_file + ' at line: ' + line) @@ -520,8 +519,8 @@ def load_transcript_fasta(fa_file): idx = idx + 1 seq_str = '' - while idx < len(lines) and not re.match('^>ENST', lines[idx]): - seq_str = seq_str + lines[idx].strip() + while idx < len(lines) and not re.match('^>ENST', lines[idx].decode('utf-8')): + seq_str = seq_str + lines[idx].decode('utf-8').strip() idx = idx + 1 seqs[transcript_id] = seq_str
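For reference, the bytes-versus-str behavior this last patch works around can be reproduced in isolation. A minimal sketch under Python 3 -- the file name and echoed value below are placeholders, not the pipeline's real inputs:

```python
import gzip
import re
import subprocess

# subprocess.check_output() returns bytes under Python 3, and str-pattern
# regex operations on bytes raise a TypeError, hence the .decode('utf-8').
raw = subprocess.check_output('echo hla_a_01_01', shell=True)  # b'hla_a_01_01\n'
fields = re.split('\n|\t', raw.decode('utf-8'))                # ['hla_a_01_01', '']

# gzip files opened with mode 'r' also yield bytes lines, so each line must
# be decoded before str operations like re.match() or strip() can be used.
with gzip.open('transcripts.fa.gz', 'w') as out:               # placeholder file
    out.write(b'>ENST00000000001 test\nATGGCT\n')

with gzip.open('transcripts.fa.gz', 'r') as fh:
    for line in fh:
        line = line.decode('utf-8')
        if re.match('^>ENST', line):
            print(line.strip())                                # '>ENST00000000001 test'
```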