From 72df07edab06542e368bed0b5dd52eff52ca88cd Mon Sep 17 00:00:00 2001
From: gongyixiao <5620765+gongyixiao@users.noreply.github.com>
Date: Mon, 2 Dec 2019 17:15:26 -0500
Subject: [PATCH] skip header lines start with "#"

Solving: https://github.com/taylor-lab/neoantigen-dev/issues/8
---
 neoantigen.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/neoantigen.py b/neoantigen.py
index de34222..4464cf6 100644
--- a/neoantigen.py
+++ b/neoantigen.py
@@ -247,7 +247,7 @@ def main():
     logger.info('Finished loading reference CDS/cDNA sequences...')
 
     logger.info('Reading MAF file and constructing mutated peptides...')
-    maf_df = pd.read_csv(maf_file, comment='#', low_memory=False, header=0, sep="\t")
+    maf_df = skip_lines_start_with(maf_file, "#", low_memory=False, header=0, sep="\t")
     n_muts = n_non_syn_muts = n_missing_tx_id = 0
     for index, row in maf_df.iterrows():
         cds_seq = ''
@@ -472,6 +472,36 @@ def main():
         exit(1)
     logger.info('neoantigen-dev pipeline execution completed.\nExiting!')
 
+# skip the header lines that start with "#"
+def skip_lines_start_with(fle, junk, **kwargs):
+    """Read a delimited text file into a DataFrame, skipping leading `junk` lines.
+
+    Only the unbroken run of lines at the top of the file that start with
+    `junk` is skipped. Unlike ``pd.read_csv(comment=...)``, later occurrences
+    of the prefix are passed to the parser untouched.
+
+    fle: path of the file to read.
+    junk: prefix string that marks a header line to skip.
+    **kwargs: forwarded unchanged to ``pd.read_csv``.
+
+    Returns the DataFrame produced by ``pd.read_csv``.
+    Raises ValueError if the file is zero bytes long.
+    """
+    if os.stat(fle).st_size == 0:
+        raise ValueError("File is empty")
+    with open(fle) as f:
+        pos = 0
+        cur_line = f.readline()
+        # stop at EOF too: readline() returns '' there, and ''.startswith('')
+        # is True, so an empty `junk` would otherwise loop forever
+        while cur_line and cur_line.startswith(junk):
+            # remember where the first non-header line begins
+            pos = f.tell()
+            cur_line = f.readline()
+        # rewind so pandas starts parsing at the first non-header line
+        f.seek(pos)
+        return pd.read_csv(f, **kwargs)
+
 # helper function to properly re-format hla_allele
 def reformat_hla_allele(hla_allele):
     if re.match(r'HLA-\w\d\d\d\d$', hla_allele):