Skip to content

Commit

Permalink
Add support for ClickBench in bench.sh (#7005)
Browse files Browse the repository at this point in the history
* Add support for ClickBench in bench.sh

* Update benchmarks/bench.sh
  • Loading branch information
alamb authored Jul 20, 2023
1 parent 5907c21 commit eb9a702
Showing 1 changed file with 84 additions and 8 deletions.
92 changes: 84 additions & 8 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

# This script is meant for developers of DataFusion -- it is runnable
# from the standard DataFusion development environment and uses cargo,
# etc.
# etc and orchestrates gathering data and running the benchmark binary in
# different configurations.


# Exit on error
set -e
Expand Down Expand Up @@ -64,12 +66,14 @@ compare: Compares results from benchmark runs
* Benchmarks
**********
all(default): Data/Run/Compare for all benchmarks
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet dataset
**********
* Supported Configuration (Environment Variables)
Expand Down Expand Up @@ -118,7 +122,7 @@ main() {
data)
BENCHMARK=${ARG2:-"${BENCHMARK}"}
echo "***************************"
echo "DataFusion Benchmark Data Generation"
echo "DataFusion Benchmark Runner and Data Generator"
echo "COMMAND: ${COMMAND}"
echo "BENCHMARK: ${BENCHMARK}"
echo "DATA_DIR: ${DATA_DIR}"
Expand All @@ -128,6 +132,8 @@ main() {
all)
data_tpch "1"
data_tpch "10"
data_clickbench_1
data_clickbench_partitioned
;;
tpch)
data_tpch "1"
Expand All @@ -143,6 +149,12 @@ main() {
# same data as for tpch10
data_tpch "10"
;;
clickbench_1)
data_clickbench_1
;;
clickbench_partitioned)
data_clickbench_partitioned
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
usage
Expand Down Expand Up @@ -178,6 +190,8 @@ main() {
run_tpch_mem "10"
run_parquet
run_sort
run_clickbench_1
run_clickbench_partitioned
;;
tpch)
run_tpch "1"
Expand All @@ -197,6 +211,12 @@ main() {
sort)
run_sort
;;
clickbench_1)
run_clickbench_1
;;
clickbench_partitioned)
run_clickbench_partitioned
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for run"
usage
Expand Down Expand Up @@ -318,6 +338,62 @@ run_sort() {
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
}


# Downloads the single file hits.parquet ClickBench dataset from
# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
#
# Creates data in $DATA_DIR/hits.parquet
data_clickbench_1() {
    pushd "${DATA_DIR}" > /dev/null

    # Avoid re-downloading: accept an existing file only if it has the exact
    # expected byte size of the published hits.parquet snapshot.
    # (Use $() instead of backticks; '|| true' keeps a missing file from
    # aborting the script when run under 'set -e'.)
    OUTPUT_SIZE=$(wc -c hits.parquet 2>/dev/null | awk '{print $1}' || true)
    echo -n "Checking hits.parquet..."
    if test "${OUTPUT_SIZE}" = "14779976446"; then
        echo -n "... found ${OUTPUT_SIZE} bytes ..."
    else
        URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
        echo -n "... downloading ${URL} (14GB) ... "
        # --continue resumes a partial download from a previous run.
        wget --continue "${URL}"
    fi
    echo " Done"
    popd > /dev/null
}

# Downloads the 100-file partitioned ClickBench dataset from
# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
#
# Creates data in $DATA_DIR/hits_partitioned
data_clickbench_partitioned() {
    MAX_CONCURRENT_DOWNLOADS=10

    mkdir -p "${DATA_DIR}/hits_partitioned"
    pushd "${DATA_DIR}/hits_partitioned" > /dev/null

    echo -n "Checking hits_partitioned..."
    # Sum the sizes of all files already present: with multiple files
    # 'wc -c *' prints a trailing "total" line, so take the last line.
    # '|| true' keeps an empty directory from aborting under 'set -e'.
    OUTPUT_SIZE=$(wc -c * 2>/dev/null | tail -n 1 | awk '{print $1}' || true)
    if test "${OUTPUT_SIZE}" = "14737666736"; then
        echo -n "... found ${OUTPUT_SIZE} bytes ..."
    else
        echo -n " downloading with ${MAX_CONCURRENT_DOWNLOADS} parallel workers"
        # Fetch hits_0.parquet .. hits_99.parquet; -P runs up to
        # MAX_CONCURRENT_DOWNLOADS wget processes at once, and
        # --continue resumes any partial file from a previous run.
        seq 0 99 | xargs -P"${MAX_CONCURRENT_DOWNLOADS}" -I{} bash -c 'wget -q --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet && echo -n "."'
    fi

    echo " Done"
    popd > /dev/null
}


# Runs the clickbench benchmark with a single large parquet file
run_clickbench_1() {
    # Placeholder: the benchmark binary cannot run ClickBench queries yet,
    # so emit a notice rather than failing the whole 'all' run.
    printf '%s\n' "NOTICE: ClickBench (1 parquet file) is not yet supported"
}

# Runs the clickbench benchmark with the partitioned (100 files) parquet dataset
run_clickbench_partitioned() {
    # Placeholder: the benchmark binary cannot run ClickBench queries yet,
    # so emit a notice rather than failing the whole 'all' run.
    # Bug fix: the notice previously said "1 parquet file" (copy-pasted from
    # run_clickbench_1); this is the partitioned (100 files) variant.
    echo "NOTICE: ClickBench (partitioned, 100 files) is not yet supported"
}

compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
BRANCH1="${ARG2}"
Expand Down

0 comments on commit eb9a702

Please sign in to comment.