-
Notifications
You must be signed in to change notification settings - Fork 0
/
fourcline.py
executable file
·145 lines (118 loc) · 3.65 KB
/
fourcline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import fileinput
import os
import pdb
import re
import seeq
import sys
import subprocess
import tempfile
from collections import defaultdict
from gzopen import gzopen
from itertools import izip
TOMAPfname = sys.argv[1] + '_2map'
#pdb.set_trace()
with gzopen(sys.argv[1]) as f, open(TOMAPfname,'w') as g:
for lineno,line in enumerate(f):
# Is a fastq keep only sequence
if lineno % 4 != 1: continue
# Exact search of NlaIII
brcd = line.rstrip().split('CATG')[0]
if len(brcd) == len(line.rstrip()): continue
seq = line.rstrip().split('CATG')[1]
# Cut if there is a MlucI site
dna = seq.split('AATT')[0]
# Write fasta to map it
if not 10 < len(brcd) < 22 : continue
if not 5 < len(dna): continue
g.write('>%s\n%s\n' % (brcd,dna))
# Map the sequences
outfname = sys.argv[1]
INDEX = '/mnt/shared/seq/gem/dm3R5/dm3R5_pT2_unmasked.gem'
# System call to `gem-mapper` passing the desired arguments.
subprocess.call([
'gem-mapper',
'-I', INDEX ,
'-i', TOMAPfname,
'-o', outfname,
'-m3',
'--unique-mapping',
'-T4'])
# Open the mapped file and keep barcode position if uniquely mapped
experiment = open(sys.argv[1] + '_experiment.tsv','w')
BCDICT = defaultdict(int)
MAPPED = sys.argv[1] + '.map'
previousln = ''
with open(MAPPED) as g:
for lineno,line in enumerate(g):
#pdb.set_trace()
items = line.rstrip().split()
if items[-1] == '-': continue
# last column contains chr:strand:pos:matches
bcd = items[0]
loc = ':'.join(items[-1].split(':')[:3])
if len(loc.split(':')) < 3:
print bcd
BCDICT[bcd + ':' + loc] += 1
for k in BCDICT:
items = k.split(':')
if len(items) != 4:
print 'Error in ', k
bcd,chrom,strand,pos = k.split(':')
count = BCDICT[k]
out = (bcd,count,chrom,strand,pos)
experiment.write('%s\t%s\t%s\t%s\t%s\n' % out)
# Append original barcodes with experiment barcode to starcode
originals = sys.argv[2] # bcd count
experiment = sys.argv[1] + '_experiment.tsv'
allbcdsfn = sys.argv[1] + '_allbcd.tsv'
with open(allbcdsfn, 'w') as outf, \
open(originals, 'r') as ori, \
open(experiment, 'r') as ex:
try:
for lineno,line in enumerate(ori):
looping = 'Originals'
bcd,count = line.rstrip().split()[:2]
outf.write(bcd + '\t' + count + '\n')
except ValueError:
pdb.set_trace()
print 'The error is in the Originals file'
print line
try:
for lineno,line in enumerate(ex):
looping = 'barcodes'
bcd,count = line.rstrip().split()[:2]
outf.write(bcd + '\t' + count + '\n')
except ValueError:
print 'The error is in the experiment file'
print line
# Call starcode on the files
starcoded = sys.argv[1] + '_starcoded.txt'
subprocess.call([
'starcode',
'-t4',
'-d2',
'--print-clusters',
'-i' ,
allbcdsfn,
'-o',
starcoded])
# Change barcodes for it's canonical and print final file
CANON = defaultdict(str)
with open(starcoded) as f:
for line in f:
items = line.split()
canonical = items[0]
alternate = list(items[2].split(','))
for bcd in alternate:
CANON[bcd] = canonical
INF = sys.argv[1] + '_experiment.tsv'
FINAL = sys.argv[1] + '_final.tsv'
with open(INF,'r') as f, open(FINAL,'w') as g:
for lineno,line in enumerate(f):
items = line.split()
bcd = items[0]
#count = int(items[1]) - 10^6
rest = items[1:]
newbcd = CANON[bcd]
if newbcd == '': continue
g.write(newbcd + ' ' + ' '.join(rest) + '\n')