-
Notifications
You must be signed in to change notification settings - Fork 0
/
bed_to_gff_converter.py
78 lines (73 loc) · 3.07 KB
/
bed_to_gff_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python2
# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters
import sys
# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
def _bed_to_gff_records( elems, line_number ):
    """Translate the tab-split fields of one BED line into GFF2 record lines.

    elems       -- list of column strings from a single BED line.
    line_number -- 1-based line number, used to synthesise feature/group
                   names when the BED line has no name column.

    Returns a list of newline-terminated GFF lines: one feature line and,
    for 12-column BED input, one additional exon line per block.

    Raises ValueError when a coordinate or block field is not an integer,
    and IndexError when a required column or block entry is missing.
    """
    complete_bed = len( elems ) == 12
    chrom = elems[0]
    # BED starts are 0-based while GFF starts are 1-based; ends are unchanged.
    start = int( elems[1] ) + 1
    end = int( elems[2] )

    # Optional columns fall back to the same defaults the converter has always used.
    if len( elems ) > 3:
        feature = elems[3]
        group = elems[3]
    else:
        feature = 'feature%d' % line_number
        group = 'group%d' % line_number
    if len( elems ) > 4:
        score = elems[4]
    else:
        score = '0'
    if len( elems ) > 5:
        strand = elems[5]
    else:
        strand = '+'

    if not complete_bed:
        return [ '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) ]

    # A full 12-column BED line is reported as an mRNA with its blocks as exons.
    feature = "mRNA"
    records = [ '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) ]
    block_count = int( elems[9] )
    block_sizes = elems[10].split( ',' )
    block_starts = elems[11].split( ',' )
    for j in range( block_count ):
        # Block starts are offsets relative to the feature start.
        exon_start = start + int( block_starts[j] )
        exon_end = exon_start + int( block_sizes[j] ) - 1
        records.append( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) )
    return records


def __main__():
    """Convert the BED file named by sys.argv[1] into a GFF version 2 file at sys.argv[2].

    Blank lines, lines starting with '#', 'track' or 'browser', and lines that
    cannot be parsed are skipped and counted.  A line is either written
    completely or not at all, so a skipped line never leaves partial records
    in the output.  A one-line summary is printed to stdout when done.
    """
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    print (input_name,output_name)
    skipped_lines = 0
    first_skipped_line = 0
    # Tracked separately from the loop index so empty input reports 0 lines.
    total_lines = 0
    # try/finally rather than `with`: the module's version assert still
    # admits interpreters that predate the with statement.
    out = open( output_name, 'w' )
    try:
        out.write( "##gff-version 2\n" )
        out.write( "##bed_to_gff_converter.py\n" )
        bed = open( input_name )
        try:
            for i, line in enumerate( bed ):
                total_lines = i + 1
                line = line.rstrip( '\r\n' )
                records = None
                if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ):
                    try:
                        records = _bed_to_gff_records( line.split( '\t' ), i + 1 )
                    except ( ValueError, IndexError ):
                        # Malformed data line; I/O errors are deliberately not caught here.
                        records = None
                if records is None:
                    skipped_lines += 1
                    if not first_skipped_line:
                        first_skipped_line = i + 1
                else:
                    out.writelines( records )
        finally:
            bed.close()
    finally:
        out.close()
    info_msg = "%i lines converted to GFF version 2.  " % ( total_lines - skipped_lines )
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line )
    print (info_msg)
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()