-
Notifications
You must be signed in to change notification settings - Fork 3
/
parse_sam.py
executable file
·135 lines (123 loc) · 4.06 KB
/
parse_sam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
import sys
import os
import getopt
import gzip
def usage():
msg = "usage: %s [options]" % os.path.basename(sys.argv[0])
msg += "\n-i input SAM file"
msg += "\n-e max edit distance (default=0)"
msg += "\n-o output prefix"
msg += "\n-v verbose (0/default=1/2)"
msg += "\naim: keep reads matching uniquely and (almost) perfectly"
msg += "\nreturn reads complying to filter in SAM and BED files"
sys.stderr.write( "%s\n" % msg )
inFile = "" # input SAM file
maxEditDist = 0 # max nb of differences (mismatches/indels)
outPrefix = "" # output prefix
verbose = 1
try:
opts, args = getopt.getopt( sys.argv[1:], "hi:e:o:v:" )
except getopt.GetoptError, err:
sys.stderr.write( str(err)+"\n" ); usage(); sys.exit(2)
for o, a in opts:
if o == "-h":
usage(); sys.exit()
elif o == "-i":
inFile = a
elif o == "-e":
maxEditDist = int(a)
elif o == "-o":
outPrefix = a
elif o == "-v":
verbose = int(a)
if inFile == "" or outPrefix == "":
usage(); sys.exit(1)
if verbose > 0:
msg = " ".join( sys.argv )
sys.stderr.write( "%s\n" % msg )
msg = "input SAM: %s" % inFile
msg += "\nmax edit distance: %i" % maxEditDist
sys.stderr.write( "%s\n" % msg )
inH = open( inFile )
outSamFile = "%s.sam" % outPrefix
outSamH = open( outSamFile, "w" )
outBedFile = "%s.bed.gz" % outPrefix
outBedH = gzip.open( outBedFile, "w" )
#outBedH.write( "chrom chromStart chromEnd name score strand\n" )
if verbose > 0:
msg = "outputs: '%s' and '%s'" % ( outSamFile, outBedFile )
sys.stderr.write( "%s\n" % msg )
sys.stderr.flush()
countReads = 0
countMapped = 0
countUniq = 0
countPerfect = 0
countAsPerfect = 0
countSaved = 0
# for each input line (~each read)
while True:
line = inH.readline().replace("\n","")
if line == "": break
if line[0] == "@": continue
countReads += 1
lTok = line.split("\t")
# if the read was mapped
if lTok[1] != "4":
countMapped += 1
# if the read is unique and perfect
isUniq = False
isAsPerfect = False
type_XT = ""
nbBestHits_X0 = 1000000000
nbSubOptHits_X1 = 1000000000
editDist_NM = 1000000000
for opt in lTok[11:]:
if "XT" in opt:
type_XT = opt.split(":")[2]
elif "X0" in opt:
nbBestHits_X0 = int(opt.split(":")[2])
elif "X1" in opt:
nbSubOptHits_X1 = int(opt.split(":")[2])
elif "NM" in opt:
editDist_NM = int(opt.split(":")[2])
if type_XT == "U" and nbBestHits_X0 == 1 and nbSubOptHits_X1 == 0:
isUniq = True
countUniq += 1
if isUniq:
if editDist_NM == 0:
isAsPerfect = True
countPerfect += 1
else:
if editDist_NM <= maxEditDist:
isAsPerfect = True
countAsPerfect += 1
else:
if verbose > 1:
msg = "high edit distance", lTok[0], lTok[11:]
sys.stderr.write( "%s\n" % msg )
# save it
if isUniq and isAsPerfect:
countSaved += 1
# in SAM format
outSamH.write( "%s\n" % line )
# in BED format
length = len(lTok[9])
strand = ""
if lTok[1] == "0":
strand = "+"
elif lTok[1] == "16":
strand = "-"
bed = "%s\t%i\t%i" % ( lTok[2], int(lTok[3])-1, int(lTok[3])+length )
bed += "\t%s\t1000\t%s" % ( lTok[0], strand )
outBedH.write( "%s\n" % bed )
if verbose > 0:
msg = "nb of reads: %i" % countReads
msg += "\nnb of mapped reads: %i" % countMapped
msg += "\nnb of reads mapping uniquely: %i" % countUniq
msg += "\nnb of reads mapping perfectly: %i" % countPerfect
msg += "\nnb of reads mapping almost perfectly: %i" % countAsPerfect
msg += "\nnb of saved reads: %i" % countSaved
sys.stderr.write( "%s\n" % msg )
inH.close()
outSamH.close()