-
Notifications
You must be signed in to change notification settings - Fork 3
/
CountRecords.py
67 lines (54 loc) · 2.78 KB
/
CountRecords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Copyright (c) 2015 William Lees
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Count FASTA records accounting for duplicate counts in the header
__author__ = 'William Lees'
__docformat__ = "restructuredtext en"
import os
import sys
import argparse
import csv
import re
import numpy as np
import matplotlib.pyplot as plt
def _parse_dup_count(header, dupheader):
    """Return the duplicate count that follows ``dupheader`` in ``header``.

    :param header: a FASTA header line, with or without its trailing newline.
    :param dupheader: the prefix that introduces the count, eg ``DUPCOUNT=``.
    :returns: the run of ASCII digits immediately after the first occurrence
        of ``dupheader`` as an int, or ``None`` if ``dupheader`` is absent or
        is not directly followed by a digit.
    """
    start = header.find(dupheader)
    if start < 0:
        return None
    # Match the whole leading digit run in one go. Scanning prefixes with
    # range(1, len(tail)) stops one character short, which truncates (or
    # misses) a count that ends the file without a trailing newline.
    digits = re.match(r'[0-9]+', header[start + len(dupheader):])
    return int(digits.group(0)) if digits else None


def main(argv):
    """Count FASTA records in a file, accounting for duplicate counts in the headers.

    Every line starting with ``>`` is one record. If the header contains the
    duplicate-count prefix, the integer that follows it is added to the read
    total; headers without the prefix are counted as a single read. The totals
    and any warnings are printed to standard output.

    :param argv: full argument vector including the program name, ie
        ``[prog, dupheader, infile]``. If empty or ``None``, argparse falls
        back to ``sys.argv``.
    :returns: ``None``.
    """
    parser = argparse.ArgumentParser(description='Count FASTA records, accounting for duplicate counts in the header')
    parser.add_argument('dupheader', help='Prefix for duplicate count, eg "DUPCOUNT=" for Presto, "size=" for usearch')
    parser.add_argument('infile', help='input file (FASTA)')
    # Honour the argv we were given instead of implicitly re-reading sys.argv;
    # parse_args(None) keeps the old behaviour when no vector is supplied.
    args = parser.parse_args(argv[1:] if argv else None)

    record_count = 0
    count_with_dupes = 0
    dupheader_not_seen = False

    with open(args.infile, 'r') as fi:
        for line in fi:
            if not line.startswith('>'):
                continue  # sequence data, not a header

            record_count += 1
            if args.dupheader in line:
                count = _parse_dup_count(line, args.dupheader)
                # A missing or zero count is reported rather than added.
                if count:
                    count_with_dupes += count
                else:
                    print('Warning: record count not identified in %s' % line)
            else:
                # No prefix at all: assume the record is a singleton.
                count_with_dupes += 1
                dupheader_not_seen = True

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Total FASTA records: %d' % record_count)
    print('Total reads (including duplicates): %d' % count_with_dupes)
    if dupheader_not_seen:
        print('Warning: one or more records did not include a duplicate count (these were assumed to be singletons).')
# Script entry point: run the counter only when this file is executed
# directly, handing main() the raw command-line argument vector.
if __name__ == '__main__':
    main(sys.argv)