forked from jianshu93/bindash
-
Notifications
You must be signed in to change notification settings - Fork 0
/
how-to-reproduce-benchmark-results.txt
93 lines (60 loc) · 7.18 KB
/
how-to-reproduce-benchmark-results.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# This document describes how to reproduce the results in the paper.
# Before doing anything else, please compile the source code of bindash and mash into executables. Detail about how to compile them are all on github.
# After compiling, make sure that bindash and mash can be found in the system. For example, alias bindash=$HOME/bindash/release/bindash # please modify as needed
# Then, change the current working directory to bindash/benchmark/
# step 1: download only the 120 reference genomes in assembly_summary_2018032200.txt
# we can use the script download-refseq.py for step 1.
# step 2: download all the 110426 genomes in assembly_summary_2018032200.txt into an SSD
# we can use the script download-refseq.py for step 2, but we have to slightly modify the code such that there is no check for the keyword 'reference genome'
# step 3: for each set of files downloaded in step 1 and 2, create the file "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}"
# Assuming seqs is the directory containing all the files downloaded from step 1 or 2.
# The next six commented lines describe an example of the file content of "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" (the starting string "# " is excluded)
# seqs/GCF_000003135.1_ASM313v1_genomic.fna.gz
# seqs/GCF_000003215.1_ASM321v1_genomic.fna.gz
# seqs/GCF_000003645.1_ASM364v1_genomic.fna.gz
# <more lines, where each line indicates the filepath of the genomic fasta file of an assembly that we downloaded in step 1 or 2>
# seqs/GCF_900291885.1_CF112_genomic.fna.gz
# seqs/GCF_900291895.1_CF111_genomic.fna.gz
# step 4: evaluate the precision computed by bindash and mash from the 120 genomes downloaded in step 1
FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-refgenomes-raw.txt # please modify as needed
SSD_DIR=/home/zhaoxiaofei/bindash/output/precision/ # please modify as needed
PREFIX="${SSD_DIR}/assembly_bacteria_2018032201_refgenomes_"
mkdir -p "${SSD_DIR}"
# Please make sure that the system has at least 7GB of free RAM available before running the following two commands.
# If the system does not have enough RAM, we can still use the following procedure to generate the same output file.
# 1 - split FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE into multiple smaller files
# 2 - sketch each smaller file, each smaller file can be used as either query-sketch or target-sketch
# 3 - use bindash dist [options] query-sketch target-sketch for each combination of smaller query-sketch file and smaller target-sketch file
# 4 - concatenate the result.tsv files from the previous combinations of query-sketch and target-sketch into one single output file.
bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}groundtruth_nthreads8 --nthreads=8 --minhashtype=-1 # -1 means exact jaccard index
bindash dist ${PREFIX}groundtruth_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}groundtruth_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros.
# 4.1: generate the root-mean-square errors (RMSEs) in Table 1 of the main text.
bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8 # use default --sketchsize64
bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}bindash_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros.
mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8 # use default -s
mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > ${PREFIX}mash_nthreads8_result.tsv
bindashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}bindash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash
mashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}mash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash
echo "#Sketch-size Mash-RMSE BinDash-RMSE" > ${PREFIX}_table1.precision.txt
echo "Default-param-value ${mashRMSE} ${bindashRMSE}" > ${PREFIX}_table1.precision.txt
# 4.2: generate the root-mean-square errors (RMSEs) in Table 1 of the supplementary information.
echo "#sketch-sizes Mash-RMSE BinDash-RMSE" > ${PREFIX}_suppTable1.precision.txt # this tsv file contains data for Table 1 in the supplementary information
for n64bits in 1 2 4 8 16 32 64; do # the sequence 1 2 ... 64 corresponds to the sequence 64 128 ... 4096 in Table 1 in the supplementary information
bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8 --sketchsize64="${n64bits}"
bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}bindash_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros
mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8 -s $((${n64bits}*64))
mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > ${PREFIX}mash_nthreads8_result.tsv
bindashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}bindash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash
mashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}mash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash
echo "$((${n64bits}*64)) ${mashRMSE} ${bindashRMSE}" >> ${PREFIX}_suppTable1.precision.txt
done
# step 5: evaluate the runtime of bindash and mash from all the 110426 genomes downloaded in step 2. Time durations measured by the "time -p" command are used in Table 1 of the main text.
FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-all-raw.txt # please modify as needed
SSD_DIR=/home/zhaoxiaofei/bindash/output/ssd/all # please modify as needed
PREFIX="${SSD_DIR}/ssd_assembly_bacteria_2018032201_"
mkdir -p "${SSD_DIR}"
(time -p bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8) 2>"${PREFIX}_table1.bindashSketchTime.txt" ;
(time -p bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 > /dev/null) 2>"${PREFIX}_table1.bindashDistTime.txt" ;
# stdout redirection prevents having a huge output file
(time -p mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8) 2>"${PREFIX}_table1.mashSketchTime.txt" ;
(time -p mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > /dev/null) 2>"${PREFIX}_table1.mashDistTime.txt" ;