-
Notifications
You must be signed in to change notification settings - Fork 0
/
gtf_to_junc.py
57 lines (40 loc) · 1.18 KB
/
gtf_to_junc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import sys
import re
genes = dict()
geneStrands = dict()
with sys.stdin as lines:
for line in lines:
if line[0] == "#":
continue
cols = line.rstrip().split("\t")
feature = cols[2]
if feature != "exon":
continue
chrom, c1, c2, strand, data = cols[0], cols[3], cols[4], cols[6], cols[-1]
gid = re.search('gene_name "([^"]+)";', data).group(1)
tid = re.search('transcript_id "([^"]+)";', data).group(1)
if chrom not in genes:
genes[chrom] = dict()
if gid not in genes[chrom]:
genes[chrom][gid] = dict()
if tid not in genes[chrom][gid]:
genes[chrom][gid][tid] = list()
geneStrands[gid] = strand
genes[chrom][gid][tid].append((int(c1),int(c2)))
uniqueDict = dict()
for chrom, allgenes in genes.items():
for gene, txns in allgenes.items():
for tid, coords in txns.items():
if geneStrands[gene] == "-":
coords = coords[::-1]
for num, coord in enumerate(coords,0):
if num == 0 :
continue
j2, null = coord
null, j1 = coords[num-1]
j1 , j2 = j1-1, j2-1
jcnSet = (j1, j2, gene)
if jcnSet not in uniqueDict:
print(chrom, j1, j2, gene, ".", geneStrands[gene], sep="\t")
uniqueDict[jcnSet] = 1