-
Notifications
You must be signed in to change notification settings - Fork 153
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #208 from paradedb/main
Update ParadeDB Results to v0.8.4
- Loading branch information
Showing
12 changed files
with
266 additions
and
309 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,27 @@ | ||
# ParadeDB | ||
|
||
ParadeDB is an alternative to Elasticsearch built on Postgres. | ||
ParadeDB is an Elasticsearch alternative built on Postgres. | ||
|
||
- [GitHub](https://github.com/paradedb/paradedb) | ||
- [Homepage](https://paradedb.com) | ||
|
||
The published benchmarks are based on ParadeDB version `v0.5.4`. | ||
The published benchmarks are based on ParadeDB version `v0.8.4`. | ||
|
||
## Benchmarks | ||
|
||
To run the benchmarks yourself: | ||
To run the benchmarks: | ||
|
||
1. Manually start an AWS EC2 instance | ||
- `c6a.4xlarge` | ||
- Ubuntu Server 22.04 LTS (HVM), SSD Volume Type | ||
- Root 500GB gp2 SSD | ||
- Ubuntu Server 22.04 LTS (HVM), SSD Volume Type\* | ||
- Root 500GB gp2 SSD\*\* | ||
2. Wait for the status check to pass, then SSH into the instance via EC2 Instance Connect | ||
3. Clone this repository via `git clone https://github.com/ClickHouse/ClickBench` | ||
4. Navigate to the `paradedb` directory via `cd ClickBench/paradedb` | ||
5. Run the benchmark via `./benchmark.sh` | ||
5. Run the benchmark via `./benchmark.sh`. This will run the benchmarks against the default settings below. | ||
|
||
The benchmark should be completed in under an hour. If you'd like to benchmark against a different version of ParadeDB, modify the Docker tag in `benchmark.sh`. You can find the list of available tags [here](https://hub.docker.com/r/paradedb/paradedb/tags). | ||
The benchmark script takes the following parameters: | ||
|
||
- `-w` - Type of workload, either `single` or `partitioned`. The default is `single`, meaning it uses the `hits.parquet` ClickBench dataset. The `partitioned` option uses the ClickBench partitioned dataset. | ||
|
||
The benchmark should be completed within a few minutes. If you'd like to benchmark against a different version of ParadeDB, modify the Docker tag in the `benchmark.sh` script. You can find the list of available tags [here](https://hub.docker.com/r/paradedb/paradedb/tags). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,65 +1,107 @@ | ||
#!/bin/bash | ||
|
||
# Cleanup function to reset the environment | ||
PARADEDB_VERSION=0.8.4 | ||
FLAG_WORKLOAD=single | ||
|
||
#######################################
# Print the command-line help text and exit.
# Arguments: $1 - exit status to terminate with (optional, default 1)
# Outputs:   writes the help text to stdout
# Returns:   never returns; exits the shell with the requested status
#######################################
usage() {
  # Default to 1 so existing callers that pass no argument keep treating
  # usage output as a failure.
  local exit_code=${1:-1}

  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo " -h (optional), Display this help message"
  echo " -w (optional), Workload type, either <single> or <partitioned>. Default is <single>."
  exit "$exit_code"
}
|
||
#######################################
# Stop and remove the "paradedb" Docker container, if there is one.
# Intended to run when the script exits, which can happen before the
# container has been created, so every Docker action is guarded by an
# existence check instead of being issued unconditionally.
# Outputs: progress messages on stdout
#######################################
cleanup() {
  echo ""
  echo "Cleaning up..."
  # Only a running container can be killed.
  if sudo docker ps -q --filter "name=paradedb" | grep -q .; then
    sudo docker kill paradedb
  fi
  # -a also lists stopped containers; skip the removal when none exists so
  # an early exit does not end with a "No such container" error.
  if sudo docker ps -aq --filter "name=paradedb" | grep -q .; then
    sudo docker rm paradedb
  fi
  echo "Done, goodbye!"
}
|
||
# Register the cleanup function to run when the script exits | ||
trap cleanup EXIT | ||
|
||
sudo apt-get update | ||
sudo apt-get install -y docker.io | ||
sudo apt-get install -y postgresql-client | ||
# Parse command-line options.
#   -h        print the help text
#   -w VALUE  select the workload; only "single" and "partitioned" are accepted
# Any other option, or an invalid -w value, prints the help text and exits.
while getopts "hw:" flag; do
  case "$flag" in
    h)
      # Help was requested explicitly, so ask usage() for a success status.
      # A usage() that takes no argument simply keeps its own exit status.
      usage 0
      ;;
    w)
      # Validate before assigning so FLAG_WORKLOAD never holds a rejected value.
      case "$OPTARG" in
        single | partitioned)
          FLAG_WORKLOAD=$OPTARG
          ;;
        *)
          usage
          ;;
      esac
      ;;
    *)
      usage
      ;;
  esac
done
|
||
if [ ! -e hits.tsv ]; then | ||
echo "" | ||
echo "Downloading dataset..." | ||
wget --no-verbose --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' | ||
gzip -d hits.tsv.gz | ||
else | ||
echo "" | ||
echo "Dataset already downloaded, skipping..." | ||
fi | ||
echo "" | ||
echo "Installing dependencies..." | ||
sudo apt-get update -y | ||
sudo apt-get install -y docker.io postgresql-client | ||
|
||
echo "" | ||
echo "Pulling ParadeDB image..." | ||
sudo docker run \ | ||
-e POSTGRES_USER=myuser \ | ||
-e POSTGRES_PASSWORD=mypassword \ | ||
-e POSTGRES_DB=mydb \ | ||
-p 5432:5432 \ | ||
--name paradedb \ | ||
-d \ | ||
paradedb/paradedb:0.5.4 | ||
--name paradedb \ | ||
-e POSTGRESQL_USERNAME=myuser \ | ||
-e POSTGRESQL_PASSWORD=mypassword \ | ||
-e POSTGRESQL_DATABASE=mydb \ | ||
-e POSTGRESQL_POSTGRES_PASSWORD=postgres \ | ||
-p 5432:5432 \ | ||
-d \ | ||
paradedb/paradedb:$PARADEDB_VERSION | ||
|
||
echo "" | ||
echo "Waiting for ParadeDB to start..." | ||
sleep 10 | ||
echo "ParadeDB is ready!" | ||
|
||
echo "" | ||
echo "Downloading ClickBench dataset ($FLAG_WORKLOAD)..." | ||
# Make the selected ClickBench dataset available inside the "paradedb"
# container: download it to /tmp on the host when it is not there yet, then
# copy it into the container unless the container already has it.
if [ "$FLAG_WORKLOAD" = "single" ]; then
  if [ ! -e /tmp/hits.parquet ]; then
    wget --no-verbose --continue -O /tmp/hits.parquet https://datasets.clickhouse.com/hits_compatible/hits.parquet
  fi
  if ! sudo docker exec paradedb sh -c '[ -f /tmp/hits.parquet ]'; then
    sudo docker cp /tmp/hits.parquet paradedb:/tmp/hits.parquet
  fi
elif [ "$FLAG_WORKLOAD" = "partitioned" ]; then
  if [ ! -e /tmp/partitioned/ ]; then
    mkdir -p /tmp/partitioned
    # Fetch the partition files hits_0.parquet .. hits_99.parquet in parallel.
    seq 0 99 | xargs -P100 -I{} bash -c 'wget --no-verbose --directory-prefix /tmp/partitioned --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
  fi
  # /tmp/partitioned is a directory, so it must be tested with -d; -f is
  # never true for a directory and would force a copy every time.
  if ! sudo docker exec paradedb sh -c '[ -d /tmp/partitioned ]'; then
    sudo docker cp /tmp/partitioned paradedb:/tmp
  fi
else
  echo "Invalid workload type: $FLAG_WORKLOAD"
  exit 1
fi
|
||
echo "" | ||
echo "Loading dataset..." | ||
export PGPASSWORD='mypassword' | ||
psql -h localhost -U myuser -d mydb -p 5432 -t < create.sql | ||
psql -h localhost -U myuser -d mydb -p 5432 -t -c 'CALL paradedb.init();' -c '\timing' -c "\\copy hits FROM 'hits.tsv'" | ||
echo "Creating database..." | ||
export PGPASSWORD='postgres' | ||
if [ $FLAG_WORKLOAD == "single" ]; then | ||
psql -h localhost -U postgres -d mydb -p 5432 -t < create-single.sql | ||
else | ||
psql -h localhost -U postgres -d mydb -p 5432 -t < create-partitioned.sql | ||
fi | ||
|
||
# COPY 99997497 | ||
# Time: 1268695.244 ms (21:08.695) | ||
# load_time is zero, since the data is directly read from the Parquet file(s) | ||
# Time: 0000000.000 ms (00:00.000) | ||
|
||
echo "" | ||
echo "Running queries..." | ||
./run.sh 2>&1 | tee log.txt | ||
|
||
sudo docker exec -it paradedb du -bcs /var/lib/postgresql/data | ||
|
||
# 15415061091 /var/lib/postgresql/data | ||
# 15415061091 total | ||
# data_size is the Parquet file(s) total size | ||
# 14779976446 | ||
|
||
echo "" | ||
echo "Parsing results..." | ||
cat log.txt | grep -oP 'Time: \d+\.\d+ ms' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | | ||
awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' | ||
awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
CREATE FOREIGN DATA WRAPPER parquet_wrapper | ||
HANDLER parquet_fdw_handler | ||
VALIDATOR parquet_fdw_validator; | ||
|
||
CREATE SERVER parquet_server | ||
FOREIGN DATA WRAPPER parquet_wrapper; | ||
|
||
CREATE FOREIGN TABLE hits () | ||
SERVER parquet_server | ||
OPTIONS (files '/tmp/partitioned/*.parquet'); |
Oops, something went wrong.