how-to-reproduce-benchmark-results.txt

# This document describes how to reproduce the results in the paper.


# Before doing anything else, please compile the source code of bindash and mash into executables. Detail about how to compile them are all on github. 
# After compiling, make sure that bindash and mash can be found in the system. For example, alias bindash=$HOME/bindash/release/bindash # please modify as needed
# Then, change the current working directory to bindash/benchmark/


# step 1: download only the 120 reference genomes in assembly_summary_2018032200.txt
# we can use the script download-refseq.py for step 1.


# step 2: download all the 110426 genomes in assembly_summary_2018032200.txt into an SSD
# we can use the script download-refseq.py for step 2, but we have to slightly modify the code such that there is no check for the keyword 'reference genome'


# step 3: for each set of files downloaded in step 1 and 2, create the file "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}"
# Assuming seqs is the directory containing all the files downloaded from step 1 or 2.
# The next six commented lines describe an example of the file content of "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" (the starting string "# " is excluded)
# seqs/GCF_000003135.1_ASM313v1_genomic.fna.gz
# seqs/GCF_000003215.1_ASM321v1_genomic.fna.gz
# seqs/GCF_000003645.1_ASM364v1_genomic.fna.gz
# <more lines, where each line indicates the filepath of the genomic fasta file of an assembly that we downloaded in step 1 or 2>
# seqs/GCF_900291885.1_CF112_genomic.fna.gz
# seqs/GCF_900291895.1_CF111_genomic.fna.gz


# step 4: evaluate the precision computed by bindash and mash from the 120 genomes downloaded in step 1

FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-refgenomes-raw.txt # please modify as needed
SSD_DIR=/home/zhaoxiaofei/bindash/output/precision/ # please modify as needed

PREFIX="${SSD_DIR}/assembly_bacteria_2018032201_refgenomes_"

mkdir -p "${SSD_DIR}"

# Please make sure that the system has at least 7GB of free RAM available before running the following two commands.
# If the system does not have enough RAM, we can still use the following procedure to generate the same output file.
#   1 - split FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE into multiple smaller files
#   2 - sketch each smaller file, each smaller file can be used as either query-sketch or target-sketch
#   3 - use bindash dist [options] query-sketch target-sketch for each combination of smaller query-sketch file and smaller target-sketch file
#   4 - concatenate the result.tsv files from the previous combinations of query-sketch and target-sketch into one single output file.
bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}groundtruth_nthreads8 --nthreads=8 --minhashtype=-1 # -1 means exact jaccard index
bindash dist ${PREFIX}groundtruth_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}groundtruth_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros.

# 4.1: generate the root-mean-square errors (RMSEs) in Table 1 of the main text.

bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8 # use default --sketchsize64
bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}bindash_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros.

mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8 # use default -s
mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > ${PREFIX}mash_nthreads8_result.tsv

bindashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}bindash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash

mashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}mash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash

echo "#Sketch-size Mash-RMSE BinDash-RMSE" > ${PREFIX}_table1.precision.txt
echo "Default-param-value ${mashRMSE} ${bindashRMSE}" > ${PREFIX}_table1.precision.txt

# 4.2: generate the root-mean-square errors (RMSEs) in Table 1 of the supplementary information.

echo "#sketch-sizes Mash-RMSE BinDash-RMSE" > ${PREFIX}_suppTable1.precision.txt # this tsv file contains data for Table 1 in the supplementary information
for n64bits in 1 2 4 8 16 32 64; do # the sequence 1 2 ... 64 corresponds to the sequence 64 128 ... 4096 in Table 1 in the supplementary information

	bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8 --sketchsize64="${n64bits}"
	bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}bindash_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros

	mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8 -s $((${n64bits}*64))
	mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > ${PREFIX}mash_nthreads8_result.tsv

	bindashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}bindash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash

	mashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}mash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash
	
	echo "$((${n64bits}*64)) ${mashRMSE} ${bindashRMSE}" >> ${PREFIX}_suppTable1.precision.txt
done

# step 5: evaluate the runtime of bindash and mash from all the 110426 genomes downloaded in step 2. Time durations measured by the "time -p" command are used in Table 1 of the main text.

FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-all-raw.txt # please modify as needed
SSD_DIR=/home/zhaoxiaofei/bindash/output/ssd/all # please modify as needed

PREFIX="${SSD_DIR}/ssd_assembly_bacteria_2018032201_"

mkdir -p "${SSD_DIR}"

(time -p bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8) 2>"${PREFIX}_table1.bindashSketchTime.txt" ;
(time -p bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 > /dev/null) 2>"${PREFIX}_table1.bindashDistTime.txt" ; 
# stdout redirection prevents having a huge output file

(time -p mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8) 2>"${PREFIX}_table1.mashSketchTime.txt" ;
(time -p mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > /dev/null) 2>"${PREFIX}_table1.mashDistTime.txt" ;