From eb9a702bbfc80a3b9b2ff1089f845e22d3fe5607 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 20 Jul 2023 15:22:05 -0400
Subject: [PATCH] Add support for ClickBench in bench.sh (#7005)

* Add support for ClickBench in bench.sh

* Update benchmarks/bench.sh
---
 benchmarks/bench.sh | 92 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 84 insertions(+), 8 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 05236ad5ade6..f71094a42549 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -18,7 +18,9 @@
 
 # This script is meant for developers of DataFusion -- it is runnable
 # from the standard DataFusion development environment and uses cargo,
-# etc.
+# etc and orchestrates gathering data and running the benchmark binary in
+# different configurations.
+
 
 # Exit on error
 set -e
@@ -64,12 +66,14 @@ compare: Comares results from benchmark runs
 * Benchmarks
 **********
 all(default): Data/Run/Compare for all benchmarks
-tpch:         TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
-tpch_mem:     TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
-tpch10:       TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
-tpch10_mem:   TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
-parquet:      Benchmark of parquet reader's filtering speed
-sort:         Benchmark of sorting speed
+tpch:                   TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
+tpch_mem:               TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+tpch10:                 TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
+tpch10_mem:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
+parquet:                Benchmark of parquet reader's filtering speed
+sort:                   Benchmark of sorting speed
+clickbench_1:           ClickBench queries against a single parquet file
+clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
 
 **********
 * Supported Configuration (Environment Variables)
@@ -118,7 +122,7 @@ main() {
         data)
             BENCHMARK=${ARG2:-"${BENCHMARK}"}
             echo "***************************"
-            echo "DataFusion Benchmark Data Generation"
+            echo "DataFusion Benchmark Runner and Data Generator"
             echo "COMMAND: ${COMMAND}"
             echo "BENCHMARK: ${BENCHMARK}"
             echo "DATA_DIR: ${DATA_DIR}"
@@ -128,6 +132,8 @@ main() {
                 all)
                     data_tpch "1"
                     data_tpch "10"
+                    data_clickbench_1
+                    data_clickbench_partitioned
                     ;;
                 tpch)
                     data_tpch "1"
@@ -143,6 +149,12 @@ main() {
                     # same data as for tpch10
                     data_tpch "10"
                     ;;
+                clickbench_1)
+                    data_clickbench_1
+                    ;;
+                clickbench_partitioned)
+                    data_clickbench_partitioned
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                     usage
@@ -178,6 +190,8 @@ main() {
                     run_tpch_mem "10"
                     run_parquet
                     run_sort
+                    run_clickbench_1
+                    run_clickbench_partitioned
                     ;;
                 tpch)
                     run_tpch "1"
@@ -197,6 +211,12 @@ main() {
                 sort)
                     run_sort
                     ;;
+                clickbench_1)
+                    run_clickbench_1
+                    ;;
+                clickbench_partitioned)
+                    run_clickbench_partitioned
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
                     usage
@@ -318,6 +338,62 @@ run_sort() {
     $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
 }
 
+
+# Downloads the single-file hits.parquet ClickBench dataset from
+# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
+#
+# Creates data in $DATA_DIR/hits.parquet
+data_clickbench_1() {
+    pushd "${DATA_DIR}" > /dev/null
+
+    # Avoid downloading if it already exists and is the right size
+    OUTPUT_SIZE=$(wc -c hits.parquet 2>/dev/null | awk '{print $1}' || true)
+    echo -n "Checking hits.parquet..."
+    if test "${OUTPUT_SIZE}" = "14779976446"; then
+        echo -n "... found ${OUTPUT_SIZE} bytes ..."
+    else
+        URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
+        echo -n "... downloading ${URL} (14GB) ... "
+        wget --continue "${URL}"
+    fi
+    echo " Done"
+    popd > /dev/null
+}
+
+# Downloads the 100-file partitioned ClickBench dataset from
+# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
+#
+# Creates data in $DATA_DIR/hits_partitioned
+data_clickbench_partitioned() {
+    MAX_CONCURRENT_DOWNLOADS=10
+
+    mkdir -p "${DATA_DIR}/hits_partitioned"
+    pushd "${DATA_DIR}/hits_partitioned" > /dev/null
+
+    echo -n "Checking hits_partitioned..."
+    OUTPUT_SIZE=$(wc -c * 2>/dev/null | tail -n 1 | awk '{print $1}' || true)
+    if test "${OUTPUT_SIZE}" = "14737666736"; then
+        echo -n "... found ${OUTPUT_SIZE} bytes ..."
+    else
+        echo -n " downloading with ${MAX_CONCURRENT_DOWNLOADS} parallel workers"
+        seq 0 99 | xargs -P${MAX_CONCURRENT_DOWNLOADS} -I{} bash -c 'wget -q --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet && echo -n "."'
+    fi
+
+    echo " Done"
+    popd > /dev/null
+}
+
+
+# Runs the clickbench benchmark with a single large parquet file
+run_clickbench_1() {
+    echo "NOTICE: ClickBench (1 parquet file) is not yet supported"
+}
+
+# Runs the clickbench benchmark with the partitioned (100 files) parquet data
+run_clickbench_partitioned() {
+    echo "NOTICE: ClickBench (partitioned parquet files) is not yet supported"
+}
+
 compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="${ARG2}"