Skip to content

Commit

Permalink
Implement the "Create Dumps from DBs" workflow
Browse files Browse the repository at this point in the history
Periodically running workflow on GCE that creates parquet
exports from Postgres databases restored from the latest teiserver
and replay database backup files.
  • Loading branch information
p2004a committed Sep 9, 2024
1 parent edaae94 commit 530563d
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 4 deletions.
71 changes: 67 additions & 4 deletions .github/workflows/create-source-data.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
name: Create Source Data
name: Create Dumps from DBs
on:
workflow_dispatch:
schedule:
- cron: '0 10 * * 2,6'
jobs:
create-runner:
permissions:
Expand Down Expand Up @@ -28,16 +30,77 @@ jobs:
with:
token: ${{ steps.app-token.outputs.token }}
image_project: ubuntu-os-cloud
image_family: ubuntu-2204-lts
image_family: ubuntu-2404-lts-amd64
machine_zone: europe-west4-b
machine_type: e2-standard-4
runner_service_account: ${{ vars.RUNNER_GCP_SERVICE_ACCOUNT }}
preemptible: true
ephemeral: true
boot_disk_type: pd-ssd
disk_size: 70GB
test:
export-pgdumps:
needs: create-runner
runs-on: ${{ needs.create-runner.outputs.label }}
steps:
- run: echo "This runs on the GCE VM"
# We are running on a bare-bones VM, so there is more scripting involved
# than would be needed on a standard GitHub Actions runner.
- name: Checkout source
run: |
mkdir src
cd src
git init
git remote add origin $GITHUB_SERVER_URL/$GITHUB_REPOSITORY.git
git fetch origin $GITHUB_REF
git reset --hard FETCH_HEAD
cd ..
- name: Set up PostgreSQL
run: |
sudo apt-get --yes install postgresql
while ! pg_isready; do
echo "waiting for postgres..."
sleep 1
done
sudo -u postgres psql -c "ALTER USER postgres PASSWORD '12345';"
- name: Setup DuckDB
run: |
sudo apt-get install --yes unzip
curl -L https://github.com/duckdb/duckdb/releases/download/v1.1.0/duckdb_cli-linux-amd64.zip > duckdb.zip
unzip duckdb.zip duckdb
sudo mv duckdb /usr/local/bin
export HOME=$(pwd)
duckdb :memory: 'INSTALL postgres;'
- name: Restore databases
run: |
function restore {
local BACKUP="$(gcloud storage ls gs://$1 | sort -r | head -n 1)"
gcloud storage cp "$BACKUP" .
psql -c "CREATE DATABASE $2;"
time zstdcat "$(basename "$BACKUP")" \
| pg_restore -d postgres --clean --create --no-owner --no-privileges
}
restore "$REPLAY_BACKUPS_GCS_BUCKET" bar &
restore "$TEISERVER_BACKUPS_GCS_BUCKET" teiserver_prod &
wait %1 %2
env:
REPLAY_BACKUPS_GCS_BUCKET: ${{ vars.REPLAY_BACKUPS_GCS_BUCKET }}
TEISERVER_BACKUPS_GCS_BUCKET: ${{ vars.TEISERVER_BACKUPS_GCS_BUCKET }}
PGPASSWORD: 12345
PGHOST: 127.0.0.1
PGUSER: postgres
- name: Export parquet files
run: |
mkdir data_export
export HOME=$(pwd)
duckdb < src/scripts/export_prod_data_source.sql
env:
PGPASSWORD: 12345
PGHOST: 127.0.0.1
PGUSER: postgres
- name: Save data export in GCS bucket
run: |
gcloud config set storage/parallel_composite_upload_compatibility_check False
gcloud storage rsync data_export/ gs://$DATA_MART_GCS_BUCKET/pgdumps --recursive --delete-unmatched-destination-objects
env:
DATA_MART_GCS_BUCKET: ${{ vars.DATA_MART_GCS_BUCKET }}
14 changes: 14 additions & 0 deletions scripts/export_prod_data_source.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- noqa: disable=all
-- Export production tables to zstd-compressed parquet files under
-- data_export/. Runs in DuckDB with the postgres extension; the connection
-- presumably picks up host/user/password from the libpq environment
-- variables (PGHOST/PGUSER/PGPASSWORD) set by the calling workflow step --
-- confirm against DuckDB's postgres-scanner connection handling.

-- Teiserver database, restored locally from the newest teiserver backup.
ATTACH 'dbname=teiserver_prod' AS teiserver (TYPE POSTGRES, READ_ONLY);

COPY teiserver.public.teiserver_battle_matches TO 'data_export/teiserver_battle_matches.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY teiserver.public.teiserver_battle_match_memberships TO 'data_export/teiserver_battle_match_memberships.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY teiserver.public.teiserver_game_rating_logs TO 'data_export/teiserver_game_rating_logs.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);

-- Replay ("bar") database, restored locally from the newest replay backup.
ATTACH 'dbname=bar' AS replay (TYPE POSTGRES, READ_ONLY);

-- NOTE(review): the unquoted mixed-case identifiers below (Demos, AllyTeams,
-- Players, Maps) are subject to case folding during catalog lookup; verify
-- they resolve to the actual table names in the restored schema.
COPY replay.public.Demos TO 'data_export/replay_demos.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY replay.public.AllyTeams TO 'data_export/replay_ally_teams.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY replay.public.Players TO 'data_export/replay_players.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY replay.public.Maps TO 'data_export/replay_maps.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);

0 comments on commit 530563d

Please sign in to comment.