-
Notifications
You must be signed in to change notification settings - Fork 1
/
filter_subreads.py
106 lines (98 loc) · 3.54 KB
/
filter_subreads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
# Roger Volden
'''
This program is going to have to be structured a bit differently from
the other filtering script because it's hard to read in 122G worth of
subreads to query. So instead, what I'll do is collect headers from
all of the isotype files along with their isotype. From there, I'll
iterate through the subreads file and pull out the reads that I need.
I'll use the header to isotype/cell dictionary to assign it to the
correct cell and isotype.
python3 filter_subreads.py . ../../misc/41_subreads.fastq
'''
import sys
import os
def readIso(inDir):
'''
Takes a directory with isotype files and reads them all in.
A dict with the header, cell, and isotype is returned.
isoDict = {header: cell_#_bc_isotype, ...}
'''
isoDict = {}
fileList = os.listdir(inDir)
fileList = [x for x in fileList if x.endswith('_S') or x.endswith('_M')]
for file in fileList:
opened = open(file, 'r')
for line in opened:
line = line.rstrip().split()
header = line[0].split('_')[0]
isoDict[header] = file
opened.close()
return isoDict
def readSubs(inFile, isoDict):
'''
Takes a subread file and the isoDict to print out the reads
into the appropriate cell and isotype files.
'''
lineNum = 0
header, seq, quality = '', '', ''
print('Starting to read in the subreads')
for line in open(inFile):
line = line.rstrip()
if not line: continue
if lineNum % 4 == 0 and line[0] == '@':
if header:
head = '-'.join(header.split('_')[0].split('-')[:5])
if head in isoDict:
outSub = open(isoDict[head] + '_subs.fastq', 'a+')
outSub.write('@' + head + '\n')
outSub.write(seq + '\n+\n')
outSub.write(quality + '\n')
outSub.close()
print(isoDict[head])
header = line[1:]
if lineNum % 4 == 1:
seq = line
if lineNum % 4 == 3:
quality = line
lineNum += 1
if header:
head = '-'.join(header.split('_')[0].split('-')[:5])
if head in isoDict:
outSub = open(isoDict[head] + '_subs.fastq', 'a+')
outSub.write('@' + head + '\n')
outSub.write(seq + '\n+\n')
outSub.write(quality + '\n')
outSub.close()
print(isoDict[head])
# def readsubs(infile, isodict):
# '''
# takes a subread file and the isoDict to print out the reads
# into the appropriate cell and isotype files.
# '''
# linenum = 0
# header, seq, quality = '', '', ''
# print('starting to read in the subreads')
# for line in open(inFile):
# line = line.rstrip()
# if not line: continue
# if lineNum % 4 == 0 and line[0] == '@':
# if header:
# if header.split('_')[0] in isoDict:
# outSub = open(isoDict[header.split('_')[0]] + '_subs.fastq', 'a+')
# outSub.write('@' + header + '\n')
# outSub.write(seq + '\n+\n')
# outSub.write(quality + '\n')
# outSub.close()
# print(isoDict[header.split('_')[0]])
# header = line[1:]
# if lineNum % 4 == 1:
# seq = line
# if lineNum % 4 == 3:
# quality = line
# lineNum += 1
def main():
isoDict = readIso(sys.argv[1])
print('Finished collecting headers')
readSubs(sys.argv[2], isoDict)
main()