From 72df07edab06542e368bed0b5dd52eff52ca88cd Mon Sep 17 00:00:00 2001
From: gongyixiao <5620765+gongyixiao@users.noreply.github.com>
Date: Mon, 2 Dec 2019 17:15:26 -0500
Subject: [PATCH] Skip header lines that start with "#"

Solving: https://github.com/taylor-lab/neoantigen-dev/issues/8
---
 neoantigen.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/neoantigen.py b/neoantigen.py
index de34222..4464cf6 100644
--- a/neoantigen.py
+++ b/neoantigen.py
@@ -247,7 +247,7 @@ def main():
         logger.info('Finished loading reference CDS/cDNA sequences...')
 
         logger.info('Reading MAF file and constructing mutated peptides...')
-        maf_df = pd.read_csv(maf_file, comment='#', low_memory=False, header=0, sep="\t")
+        maf_df = skip_lines_start_with(maf_file, "#", low_memory=False, header=0, sep="\t")
         n_muts = n_non_syn_muts = n_missing_tx_id = 0
         for index, row in maf_df.iterrows():
             cds_seq = ''
@@ -472,6 +472,19 @@ def main():
         exit(1)
     logger.info('neoantigen-dev pipeline execution completed.\nExiting!')
 
+# Load a delimited file with pandas, skipping only the leading lines that start with `junk`.
+def skip_lines_start_with(fle, junk,**kwargs):
+    if os.stat(fle).st_size == 0:  # reject zero-byte files before opening them
+        raise ValueError("File is empty")
+    with open(fle) as f:
+        pos = 0  # offset at which the first non-junk line begins
+        cur_line = f.readline()
+        while cur_line.startswith(junk):  # NOTE(review): an empty `junk` matches "" at EOF and never exits
+            pos = f.tell()  # the junk line is consumed; the next line starts here
+            cur_line = f.readline()
+        f.seek(pos)  # rewind: readline() already consumed the first non-junk line
+        return pd.read_csv(f, **kwargs)  # kwargs are forwarded to pd.read_csv unchanged
+    
 # helper function to properly re-format hla_allele
 def reformat_hla_allele(hla_allele):
     if re.match(r'HLA-\w\d\d\d\d$', hla_allele):