-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtop3_bracken_taxa.py
67 lines (58 loc) · 3.54 KB
/
top3_bracken_taxa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
"""
Extract the top-three taxa and their read percentages from bracken's tsv-formatted output for each sample. The bracken reports must
be generated at the same level (such as 'S'), and each report must have at least three taxa.
Command:
python top3_bracken_taxa.py -l [list of sample names] -d [directory of input bracken reports] -s [suffix of Bracken reports] -o [output file name]
Dependencies: Python 3, pandas (<1.4.0 - recommend to use v1.3), bracken v2.6+
Copyright (C) 2021 Yu Wan <[email protected]>
Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
Creation: 7 Oct 2021; the latest update: 3 June 2022
"""
import os
import sys
import pandas
from argparse import ArgumentParser
def parse_argument():
parser = ArgumentParser(description = "Extract top-three taxa and their read fractions for each sample from its TSV-format Bracken report")
parser.add_argument('-l', '--list', dest = 'list', type = str, required = True, help = "A list of sample names; one name per line.")
parser.add_argument('-d', '--dir', dest = 'dir', type = str, required = True, help = "Directory of input TSV-format Bracken reports")
parser.add_argument('-s', '--suffix', dest = 'suffix', type = str, required = False, default = '_bracken.tsv', help = "Name of the reference sequence in the alignment")
parser.add_argument('-o', '--out', dest = 'out', type = str, required = False, default = 'top3taxa.tsv', help = "Output filename and path")
return parser.parse_args()
def main():
args = parse_argument()
samples = read_sample_names(args.list)
report = pandas.DataFrame(columns = ['Sample', 'Taxon_1', 'Taxon_2', 'Taxon_3', 'Percent_1', 'Percent_2', 'Percent_3']) # Initiate the output data frame
for s in samples:
bracken = read_bracken_report(os.path.join(args.dir, s + args.suffix))
r1 = bracken.loc[0] # Not use bracken.loc[0]['taxon'] or bracken.loc[0]['fraction'] to reduce the number of searching for indices.
r2 = bracken.loc[1]
r3 = bracken.loc[2]
new_line = pandas.DataFrame({'Sample' : [s], 'Taxon_1' : [r1['taxon']], 'Taxon_2' : [r2['taxon']], 'Taxon_3' : [r3['taxon']],\
'Percent_1' : [float(r1['fraction'] * 100)], 'Percent_2' : [float(r2['fraction'] * 100)], 'Percent_3' : [float(r3['fraction'] * 100)]})
report = pandas.concat([report, new_line], ignore_index = True) # Replaces the depreciated method report.append(new_line)
report.to_csv(args.out, float_format = '%.3f', sep = '\t', header = True, index = False) # float_format does not work on pandas v1.4.0+.
return
def read_sample_names(f):
if (os.path.exists(f)):
with open(f, 'r') as input_list:
samples = input_list.read().splitlines()
else:
print("Error: input sample-name list " + f + " is not found.", file = sys.stderr)
sys.exit(1)
return samples
def read_bracken_report(f):
""" Return the top-three taxa and their read fractions of the current sample """
if (os.path.exists(f)):
bracken = pandas.read_csv(f, sep = '\t')
bracken = bracken[['name', 'fraction_total_reads']] # Predefined columns in the bracken report
bracken.columns = ['taxon', 'fraction']
bracken = bracken.sort_values(by = 'fraction', ascending = False).reset_index(drop = True)
bracken = bracken.loc[0 : 2]
else:
print("Error: Bracken report " + f + " is not found.", file = sys.stderr)
sys.exit(1)
return bracken
if __name__ == '__main__':
main()