-
Notifications
You must be signed in to change notification settings - Fork 1
/
fastq-rmdup.py
executable file
·53 lines (45 loc) · 1.51 KB
/
fastq-rmdup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
import sys
import argparse
# ----- command line parsing -----
# Four required positional paths, in order: the two paired input files
# followed by the two output files.
parser = argparse.ArgumentParser(description="Removes PCR duplicates from reads")
for _arg_name, _arg_help in (
    ("R1_input", "Input file for R1."),
    ("R2_input", "Input file for R2."),
    ("R1_output", "output file for R1."),
    ("R2_output", "Output file for R2."),
):
    parser.add_argument(_arg_name, type=str, help=_arg_help)
args = parser.parse_args()
# ----- end command line parsing -----
def remove_duplicate_pairs(r1_in, r2_in, r1_out, r2_out):
    """Copy paired 4-line records, keeping the first pair per sequence pair.

    The two inputs are read in lockstep, four lines per record (name,
    sequence, separator line, quality). A pair is written to the outputs
    only the first time its (R1 sequence, R2 sequence) combination is seen.

    Args:
        r1_in: Iterable of text lines for read 1 (e.g. an open file).
        r2_in: Iterable of text lines for read 2, in the same order.
        r1_out: Writable text stream receiving the kept R1 records.
        r2_out: Writable text stream receiving the kept R2 records.

    Returns:
        A ``(total, unique)`` tuple: the number of complete record pairs
        read and the number of distinct sequence pairs written.
    """
    seen = set()
    total = 0
    rec1, rec2 = [], []
    # zip stops at the shorter input; lines beyond that point, and any
    # trailing group of fewer than four lines, are ignored.
    for line1, line2 in zip(r1_in, r2_in):
        # Strip only the newline: slicing off the last character would eat
        # real data when the final line has no trailing newline.
        rec1.append(line1.rstrip("\n"))
        rec2.append(line2.rstrip("\n"))
        if len(rec1) < 4:
            continue
        total += 1
        # A tuple key keeps the two sequences separate; concatenating them
        # would make ("AC", "GT") collide with ("A", "CGT").
        key = (rec1[1], rec2[1])
        if key not in seen:
            seen.add(key)
            r1_out.write("\n".join(rec1) + "\n")
            r2_out.write("\n".join(rec2) + "\n")
        rec1, rec2 = [], []
    return total, len(seen)


if __name__ == "__main__":
    # Context managers guarantee the outputs are flushed and every handle
    # is closed, even if processing fails part-way through.
    with open(args.R1_input) as R1in, open(args.R2_input) as R2in, \
            open(args.R1_output, 'w') as R1out, \
            open(args.R2_output, 'w') as R2out:
        total_pairs, unique_pairs = remove_duplicate_pairs(
            R1in, R2in, R1out, R2out)

    removed = total_pairs - unique_pairs
    # Empty input has no meaningful ratio; report 0% instead of dividing by 0.
    percent = (removed / total_pairs) * 100 if total_pairs else 0.0
    print("Duplicates removed: {:d}/{:d} ({:f}%).".format(
        removed, total_pairs, percent))