From 70f0b567933a339509ec7f407e0b42aff8250001 Mon Sep 17 00:00:00 2001
From: Vishal N Koparde <vishal.koparde@nih.gov>
Date: Thu, 11 Feb 2021 15:03:52 -0500
Subject: [PATCH] adding viral GTF fixing script

---
 scripts/fix_refseq_gtf.py | 268 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 scripts/fix_refseq_gtf.py

diff --git a/scripts/fix_refseq_gtf.py b/scripts/fix_refseq_gtf.py
new file mode 100644
index 0000000..130fe2a
--- /dev/null
+++ b/scripts/fix_refseq_gtf.py
@@ -0,0 +1,268 @@
+# Author: Vishal N. Koparde
+# CCBR NCI
+# Date: Aug, 2020
+
+
+import sys,copy,argparse
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-i',dest='ingtf', required=True, type=str, help="Input RefSeq GTF ..downloaded from NCBI ftp server")
+parser.add_argument('-o',dest='outgtf', required=True, type=str, help="Modified Output RefSeq GTF")
+args = parser.parse_args()
+
+def get_gene_id(column9):
+    x=column9.strip().split()
+    for i,value in enumerate(x):
+        if value=="gene_id":
+            gene_id_index=i+1
+            break
+    gene_id=x[gene_id_index]
+    return gene_id
+
+def get_gene_biotype(column9):
+    x=column9.strip().split()
+    found=0
+    for i,value in enumerate(x):
+        if value=="gene_type" or value=="gene_biotype":
+            gene_biotype_index=i+1
+            found=1
+            break
+    if found==0:
+        return '"unknown";'
+    gene_biotype=x[gene_biotype_index]
+    return gene_biotype
+
+def get_gene_name(column9):
+    x=column9.strip().split()
+    found=0
+    for i,value in enumerate(x):
+        if value=="gene" or value=="gene_name":
+            gene_index=i+1
+            found=1
+            break
+    if found==0:
+        return ""
+    gene_name=x[gene_index]
+    return gene_name
+
+def get_transcript_id(column9):
+    x=column9.strip().split()
+    found=0
+    for i,value in enumerate(x):
+        if value=="transcript_id":
+            transcript_id_index=i+1
+            found=1
+            break
+    if found==0:
+        return '"transcript_id_unknown";'
+    transcript_id=x[transcript_id_index]
+    return transcript_id
+
+def fix_transcript_id(column9,g):
+    x=column9.strip().split()
+    found=0
+    for i,value in enumerate(x):
+        if value=="transcript_id":
+            transcript_id_index=i+1
+            found=1
+            break
+    x[transcript_id_index]=g
+    if found==0:
+        x.append("transcript_id")
+        x.append(g)
+    x=" ".join(x)
+    return x   
+
+def create_new_transript_id(g,i):
+    n=g.split('"')
+    n[-2]+="_transcript_"+str(i)
+    n='"'.join(n)
+    return n
+
+def are_exons_present(transcript_lines):
+    for l in transcript_lines:
+        l_split=l.strip().split("\t")
+        if l_split[2]=="exon":
+            return True
+    else:
+        return False
+
+#create genelist
+genelist=[]
+gene_coords=dict()
+all_gtflines=list(filter(lambda x:not x.startswith("#"),open(args.ingtf).readlines()))
+blank_gene_id_lines=[]
+for f in all_gtflines:
+    its_a_gene=0
+    if f.strip().split("\t")[2]=="gene":
+        its_a_gene=1
+    gene_id=get_gene_id(f.strip().split("\t")[8])
+    if gene_id=='"";':
+        blank_gene_id_lines.append(f)
+        continue
+    genelist.append(gene_id)
+    if its_a_gene==1 and not gene_id in gene_coords:
+        gene_coords[gene_id]=(int(f.strip().split("\t")[3]),int(f.strip().split("\t")[4]))
+genelist=list(set(genelist))
+# print(genelist)
+# print(len(blank_gene_id_lines))
+
+#get genes2transcripts ... this is only for verifying that every gene has only 1 transript... this is the assumption
+gene_id_2_transcript_ids=dict()
+for g in genelist:
+    if not g in gene_id_2_transcript_ids:
+        gene_id_2_transcript_ids[g]=list()
+    lines_with_gene_id=list(filter(lambda x: g in x,all_gtflines))
+    non_gene_lines=list(filter(lambda x:x.split("\t")[2]!="gene",lines_with_gene_id))
+    for l in non_gene_lines:
+        t_id=get_transcript_id(l.strip().split("\t")[8])
+        if t_id!='"transcript_id_unknown";':
+            gene_id_2_transcript_ids[g].append(t_id)
+            gene_id_2_transcript_ids[g]=list(set(gene_id_2_transcript_ids[g]))
+
+geneid2transcriptidfile=open(args.ingtf+".geneid2transcriptid",'w')
+for k,v in gene_id_2_transcript_ids.items():
+    geneid2transcriptidfile.write("%s\t%s\n"%(k,v))
+geneid2transcriptidfile.close()
+
+#get genenames
+gene_id_2_gene_name=dict()
+for g in genelist:
+    if not g in gene_id_2_gene_name:
+        gene_id_2_gene_name[g]=list()
+    lines_with_gene_id=list(filter(lambda x: g in x,all_gtflines))
+    gene_line=list(filter(lambda x:x.split("\t")[2]=="gene",lines_with_gene_id))
+    # if len(gene_line)==0:
+    #     for l in lines_with_gene_id:
+    #         print(l,)
+    gene_line=gene_line[0]
+    gene_name=get_gene_name(gene_line.split("\t")[8])
+    if gene_name=="":
+        gene_name=g
+    gene_id_2_gene_name[g]=gene_name
+# for k,v in gene_id_2_gene_name.items():
+#     print(k,v)
+    
+#get transcript coordinates
+gene_id_2_transcript_coordinates=dict()
+for g in genelist:
+    # print("gene=",g)
+    if not g in gene_id_2_transcript_coordinates:
+        gene_id_2_transcript_coordinates[g]=list()
+    if len(gene_id_2_transcript_ids[g])==1:
+        gene_id_2_transcript_coordinates[g].append(gene_coords[g])
+    else:
+        lines_with_gene_id=list(filter(lambda x: g in x,all_gtflines))
+        non_gene_lines=list(filter(lambda x:x.split("\t")[2]!="gene",lines_with_gene_id))
+        for t in gene_id_2_transcript_ids[g]:
+            # print("transcript=",t)
+            transcript_lines=list(filter(lambda x:t in x,non_gene_lines))
+            coords=[]
+            for l in transcript_lines:
+                # print(l.strip())
+                l_split=l.split("\t")
+                coords.append(int(l_split[3]))
+                coords.append(int(l_split[4]))
+            # print()
+            gene_id_2_transcript_coordinates[g].append((min(coords),max(coords)))
+    # print(gene_id_2_transcript_coordinates[g])
+# for k,v in gene_id_2_transcript_coordinates.items():
+    # print(k,v)
+# exit()
+
+#get gene biotype\
+gene_id_2_gene_biotype=dict()
+for g in genelist:
+    lines_with_gene_id=list(filter(lambda x: g in x,all_gtflines))
+    gene_line=list(filter(lambda x:x.split("\t")[2]=="gene",lines_with_gene_id))
+    gene_line=gene_line[0]
+    gene_biotype=get_gene_biotype(gene_line.split("\t")[8])
+    gene_id_2_gene_biotype[g]=gene_biotype
+# for k,v in gene_id_2_gene_biotype.items():
+#     print(k,v)
+
+out=open(args.outgtf,'w')    
+for g in genelist:
+    lines_with_gene_id=list(filter(lambda x: g in x,all_gtflines))
+    gene_line=list(filter(lambda x:x.split("\t")[2]=="gene",lines_with_gene_id))
+    gene_line=gene_line[0]
+    gene_line=gene_line.split("\t")
+    others=gene_line.pop(-1)
+    gene_line_copy=copy.copy(gene_line)
+    # other key value pairs to add in the gene_line(col9)
+    others_to_add=[]
+    # print("others=",others)
+    for o in others.strip().split("; "):
+        # print("o=",o)
+        o2=o.split(" ")
+        # print("o2=",o2)
+        key=o2[0]
+        value=o2[1:]
+        value=" ".join(value)
+        # print("key=",key)
+        # print("value=",value)
+        if key in ["gene_id","gene","gene_name","gene_type","gene_biotype"]:
+            continue
+        else:
+            others_to_add.append(key)
+            if not ";" in value:
+                others_to_add.append(value+";")
+            else:
+                others_to_add.append(value)
+
+    col9=[]
+    col9.append("gene_id")
+    col9.append(g)
+    col9.append("gene_name")
+    col9.append(gene_id_2_gene_name[g])
+    col9.append("gene_biotype")
+    col9.append(gene_id_2_gene_biotype[g])
+    col9plus=copy.copy(col9)
+    col9plus.extend(others_to_add)
+    gene_col9=" ".join(col9plus)
+    gene_line.append(gene_col9)
+    gene_line="\t".join(gene_line)
+    out.write("%s\n"%(gene_line))
+
+    non_gene_lines=list(filter(lambda x:x.split("\t")[2]!="gene",lines_with_gene_id))
+    for i,t in enumerate(gene_id_2_transcript_ids[g]):
+        transcript_line=copy.copy(gene_line_copy)
+        transcript_line[2]="transcript"
+        transcript_line[3]=str(gene_id_2_transcript_coordinates[g][i][0])
+        transcript_line[4]=str(gene_id_2_transcript_coordinates[g][i][1])
+        new_trascript_id=create_new_transript_id(g,i+1)
+        transcript_col9=copy.copy(col9)
+        transcript_col9.append("transcript_id")
+        transcript_col9.append(new_trascript_id)
+        transcript_col9.append("transcript_name")
+        transcript_col9.append(new_trascript_id)
+        transcript_col9.append("transcript_type")
+        transcript_col9.append(gene_id_2_gene_biotype[g])
+        transcript_col9=" ".join(transcript_col9)
+        transcript_line.append(transcript_col9)
+        transcript_line="\t".join(transcript_line)   
+        out.write("%s\n"%(transcript_line))
+
+        transcript_lines=list(filter(lambda x:t in x,non_gene_lines))
+        have_exons=are_exons_present(transcript_lines)
+        for l in transcript_lines:
+            # print(l)
+            l=l.strip().split("\t")
+            tofix=l.pop(-1)
+            l.append(fix_transcript_id(tofix,new_trascript_id))
+            if l[2]=="CDS" and have_exons==False:
+                l2=copy.copy(l)
+                l2[7]="."
+                l2[2]="exon"
+                l2="\t".join(l2)
+                out.write("%s\n"%(l2))
+            l="\t".join(l)
+            out.write("%s\n"%(l))
+            # print(l)
+out.close()
+
+out=open(args.ingtf+".extralines",'w')
+for b in blank_gene_id_lines:
+    out.write(b)
+out.close()