-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_kraken_bracken.sh
182 lines (172 loc) · 6.57 KB
/
run_kraken_bracken.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/bin/bash
# Run kraken2 and bracken over a directory reads for taxonomical profiling
#
# Dependencies:
# 1. kraken2 and bracken
# 2. Python v3 and pandas package
# 3. script top3_bracken_taxa.py (in the same code directory)
#
# Copyright (C) 2024 Yu Wan <[email protected]>
# Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
# First version: 26 Feb 2022; latest update: 26 Feb 2024
# Function definitions ###############
display_parameters() {
echo "
This script runs kraken2 and bracken over a directory of reads for taxonomical profiling. It assumes paired-end input reads
with filenames following the format: [isolate name]_[1,2].fastq.gz by default. This script does not generate read-level
Kraken reports but only sample-level reports.
Parameters (six in total):
--dir_in=*: directory of input reads, no back slash (mandatory)
--dir_out=*: output directory (default: taxa)
--kraken_db=*: directory of Kraken2's database (mandatory)
--bracken_db=*: directory of Bracken's database (mandatory)
--se: a flag indicating input reads are single-end with filenames following the format [isolate name].fastq.gz (optional)
--min_read_len=*: a uniform minimum length of all input reads (default: 50, a parameter for bracken)
--report=*: filename of the final report of top-3 taxa per sample in the tab-delimited format (default: top3_taxa.tsv)
--threads=*: number of threads (default: 2)
"
}
check_dir() {
if [ ! -d "$1" ]
then
mkdir -p "$1"
fi
}
# Print parameter information ###############
if [ -z "$1" ]
then
display_parameters
exit
else
# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
# "cd --" is interpreted as "no more options following", so the command is equivalent to cd, which switches to the current user's home directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
if [ ! -f "$SCRIPT_DIR/top3_bracken_taxa.py" ]
then
echo "Error: script top3_bracken_taxa.py was not found in $SCRIPT_DIR" >&2
exit
fi
fi
# Default parameters ###############
treads=2
pe=true
dir_out='taxa'
min_read_len=50
LOG_file="$dir_out"/run_kraken_bracken.log
report="top3_taxa.tsv"
# Read customised parameters
for arg in "$@"
do
case $arg in
--dir_in=*)
dir_in="${arg#*=}"
;;
--dir_out=*)
dir_out="${arg#*=}"
;;
--kraken_db=*)
kraken_db="${arg#*=}"
;;
--bracken_db=*)
bracken_db="${arg#*=}"
;;
--se)
pe=false
;;
--min_read_len=*)
min_read_len="${arg#*=}"
;;
--report=*)
report="${arg#*=}"
;;
--threads=*)
threads="${arg#*=}"
;;
*)
;;
esac
done
if [ ! -d "$kraken_db" ] || [ ! -d "$bracken_db" ]
then
echo "Error: databases $kraken_db and/or $bracken_db do not exist." >&2
exit
else
kraken_output_root="$dir_out/1_kraken"
bracken_output_root="$dir_out/2_bracken"
check_dir "$dir_out"
check_dir $kraken_output_root
check_dir $bracken_output_root
echo "$(date) Pipeline parameters
Input directory: $dir_in
Output directory: $dir_out
Kraken2 database: $kraken_db
Bracken database: $bracken_db
Paired-end reads: $pe
Min. read length: $min_read_len
Top-3 taxa report: $report
No. of threads: $threads
" > $LOG_file
fi
# Run Kraken2 and bracken ###############
if [ -d "$dir_in" ]
then
isolates=()
if [ "$pe" = true ]
then # Run kraken2 for paired-end reads
for r1 in "$dir_in"/*_1.fastq.gz
do
s=$( basename "$r1" '_1.fastq.gz' )
isolates+=( "$s" ) # Add the isolate name into the array
r2="$dir_in/${s}_2.fastq.gz"
kraken_sample_report="$kraken_output_root/${s}_kraken_sample_report.txt"
if [ -f "$kraken_sample_report" ]
then
echo "$(date): skip the kraken analysis for sample $s as its report $kraken_sample_report exists." >> $LOG_file
else
echo "$(date): analysing paired-end reads of sample $s with Kraken2" >> $LOG_file
kraken2 --threads $threads --paired --gzip-compressed --db $kraken_db --report "$kraken_sample_report" --output - $r1 $r2
fi
done
else # Run kraken2 for single-end reads
for r1 in "$dir_in"/*.fastq.gz
do
s=$( basename "$r1" '.fastq.gz' )
isolates+=( "$s" )
kraken_sample_report="$kraken_output_root/${s}_kraken_sample_report.txt"
if [ -f "$kraken_sample_report" ]
then
echo "$(date): skip the kraken analysis for sample $s as its report $kraken_sample_report exists." >> $LOG_file
else
echo "$(date): analysing single-end reads of sample $s with Kraken2" >> $LOG_file
kraken2 --threads $threads --gzip-compressed --db $kraken_db --report "$kraken_sample_report" --output - $r1
fi
done
fi
echo "$(date): Kraken2 analysis completed." >> $LOG_file
echo "$(date): Start bracken analysis" >> $LOG_file
for s in "${isolates[@]}"
do
kraken_report="$kraken_output_root/${s}_kraken_sample_report.txt"
if [ -f "$kraken_report" ]
then
echo "$(date): summarising species-level abundance of sample $s with bracken" >> $LOG_file
bracken -r $min_read_len -t $threads -d $bracken_db -l S -i $kraken_report -o "$bracken_output_root/${s}_bracken.tsv" -w "$bracken_output_root/${s}_bracken.txt"
( head -n 1 "$bracken_output_root/${s}_bracken.tsv" && tail -n +2 "$bracken_output_root/${s}_bracken.tsv" | sort -t$'\t' -k7 -n -r ) > "$bracken_output_root/${s}_bracken_sorted.tsv"
else
echo "$(date): warning from backen: skipped sample $s for the unavailability of $kraken_report" >> $LOG_file
fi
done
echo "$(date): bracken analysis completed." >> $LOG_file
else
echo "Error: input read directory $dir_in does not exist." >&2
exit
fi
# compile bracken results ###############
echo "$(date): Compile bracken reports for top-3 taxa of each sample" >> $LOG_file
isolate_list="$dir_out/isolates.txt"
for i in "${isolates[@]}"
do
echo "$i" >> $isolate_list
done
python "$SCRIPT_DIR"/top3_bracken_taxa.py --list "$isolate_list" --dir "$bracken_output_root" --suffix '_bracken_sorted.tsv' --out "$dir_out/$report" && \
echo "$(date): The pipeline completed successfully." >> $LOG_file