Add new spark settings #89

Draft · wants to merge 3 commits into base: Release/snowplow-unified/0.5.1
54 changes: 42 additions & 12 deletions .github/workflows/spark_deployment/docker-compose.yml
@@ -7,7 +7,6 @@ networks:
services:
spark-master:
image: snowplow/spark-s3-iceberg:latest
command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
hostname: spark-master
ports:
- '8080:8080'
@@ -16,51 +15,82 @@ services:
- SPARK_LOCAL_IP=spark-master
- SPARK_MASTER_HOST=spark-master
- SPARK_MASTER_PORT=7077
- SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
# AWS credentials
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_REGION=eu-west-1
- AWS_DEFAULT_REGION=eu-west-1
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf.template
- ./setup.sh:/setup.sh
- /tmp/spark-temp:/tmp/spark-temp
- /tmp/s3a:/tmp/s3a
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 2G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
networks:
- spark-network

spark-worker:
image: snowplow/spark-s3-iceberg:latest
command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
depends_on:
- spark-master
environment:
- SPARK_WORKER_CORES=2
- SPARK_WORKER_MEMORY=4G
- SPARK_EXECUTOR_MEMORY=3G
- SPARK_WORKER_MEMORY=3G
- SPARK_EXECUTOR_MEMORY=2G
- SPARK_LOCAL_IP=spark-worker
- SPARK_MASTER=spark://spark-master:7077
# AWS credentials
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_REGION=eu-west-1
- AWS_DEFAULT_REGION=eu-west-1
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf.template
- ./setup.sh:/setup.sh
- /tmp/spark-temp:/tmp/spark-temp
- /tmp/s3a:/tmp/s3a
deploy:
resources:
limits:
memory: 8G
reservations:
memory: 4G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
networks:
- spark-network

thrift-server:
image: snowplow/spark-s3-iceberg:latest
command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
ports:
- '10000:10000'
- '4040:4040'
depends_on:
- spark-master
- spark-worker
environment:
- SPARK_LOCAL_IP=thrift-server
# AWS credentials
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_REGION=eu-west-1
- AWS_DEFAULT_REGION=eu-west-1
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf.template
- ./setup.sh:/setup.sh
- /tmp/spark-temp:/tmp/spark-temp
- /tmp/s3a:/tmp/s3a
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 2G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --conf spark.driver.memory=2g --conf spark.executor.memory=2g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
networks:
- spark-network
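
Note: with this layout each service now runs setup.sh as its entrypoint, which renders /spark/conf/spark-defaults.conf from the mounted template and then exec's the original start command. A minimal smoke test of that chain — a sketch, assuming Docker Compose v2 on the runner and the compose file above; the grep target is only an illustration:

# Bring up the master and confirm the placeholder substitution happened
docker compose up -d spark-master
docker compose exec spark-master grep 'fs.s3a.endpoint' /spark/conf/spark-defaults.conf
# Expected with AWS_REGION=eu-west-1: spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com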
13 changes: 13 additions & 0 deletions .github/workflows/spark_deployment/setup.sh
@@ -0,0 +1,13 @@
#!/bin/bash

# Create a new spark-defaults.conf with substituted values
sed -e "s|\${AWS_ACCESS_KEY_ID}|$AWS_ACCESS_KEY_ID|g" \
-e "s|\${AWS_SECRET_ACCESS_KEY}|$AWS_SECRET_ACCESS_KEY|g" \
-e "s|\${AWS_REGION}|$AWS_REGION|g" \
-e "s|\${S3_BUCKET}|$S3_BUCKET|g" \
-e "s|\${S3_RAW_DATA_DIR}|$S3_RAW_DATA_DIR|g" \
-e "s|\${S3_DWH_DIR}|$S3_DWH_DIR|g" \
/spark/conf/spark-defaults.conf.template > /spark/conf/spark-defaults.conf

# Execute the passed command
exec "$@"
114 changes: 79 additions & 35 deletions .github/workflows/spark_deployment/spark-defaults.conf
@@ -1,44 +1,88 @@
# Basic Spark Configuration
spark.master spark://spark-master:7077

spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
# Memory Configuration for GitHub Actions Runner (16GB RAM)
spark.driver.memory 4g
spark.executor.memory 6g
spark.memory.offHeap.enabled true
spark.memory.offHeap.size 2g
spark.memory.fraction 0.8
spark.memory.storageFraction 0.3

# JVM Options
spark.driver.extraJavaOptions -XX:+UseG1GC -XX:+UseCompressedOops
spark.executor.extraJavaOptions -XX:+UseG1GC -XX:+UseCompressedOops

# Executor Configuration (4 cores)
spark.executor.cores 4
spark.executor.instances 1
spark.default.parallelism 8
spark.sql.shuffle.partitions 8

# Performance Optimization
spark.sql.adaptive.enabled true
spark.sql.adaptive.coalescePartitions.enabled true
spark.sql.adaptive.localShuffleReader.enabled true
spark.sql.adaptive.skewJoin.enabled true
spark.sql.adaptive.advisoryPartitionSizeInBytes 64m

# Storage Optimization
spark.local.dir /tmp/spark-temp
spark.disk.spillSize 512m
spark.sql.files.maxPartitionBytes 67108864
spark.sql.inMemoryColumnarStorage.compressed true

# Network and Shuffle Settings
spark.shuffle.compress true
spark.shuffle.spill.compress true
spark.io.compression.codec lz4
spark.io.compression.lz4.blockSize 32k

# Catalog Configuration
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue
spark.sql.catalog.glue.database dbt-spark-iceberg

# S3 Configuration
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key <AWS_ACCESS_KEY_ID>
spark.hadoop.fs.s3a.secret.key <AWS_SECRET_ACCESS_KEY>
spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.access.key ${AWS_ACCESS_KEY_ID}
spark.hadoop.fs.s3a.secret.key ${AWS_SECRET_ACCESS_KEY}
spark.hadoop.fs.s3a.endpoint s3.${AWS_REGION}.amazonaws.com
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.region eu-west-1
spark.hadoop.fs.s3a.aws.region eu-west-1

# Enabling AWS SDK V4 signing (required for regions launched after January 2014)
spark.hadoop.com.amazonaws.services.s3.enableV4 true
spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider

# Hive Metastore Configuration (using AWS Glue)
spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory

# Thrift Server Configuration for better performance in concurrent environments
spark.sql.hive.thriftServer.singleSession false
spark.sql.hive.thriftServer.async true
# spark.sql.hive.thriftServer.maxWorkerThreads 100
# spark.sql.hive.thriftServer.minWorkerThreads 50
# spark.sql.hive.thriftServer.workerQueue.size 2000

# Memory and Performance Tuning
# spark.driver.memory 2g
# spark.executor.memory 3g
# spark.worker.memory 4g
spark.network.timeout 600s
spark.sql.broadcastTimeout 600s
spark.sql.adaptive.enabled true
spark.serializer org.apache.spark.serializer.KryoSerializer

# Logging and Debugging
spark.eventLog.enabled true
spark.eventLog.dir /tmp/spark-events
spark.hadoop.fs.s3a.region ${AWS_REGION}

# S3 Performance Optimization
spark.hadoop.fs.s3a.connection.maximum 50
spark.hadoop.fs.s3a.connection.timeout 30000
spark.hadoop.fs.s3a.attempts.maximum 10
spark.hadoop.fs.s3a.connection.establish.timeout 30000
spark.hadoop.fs.s3a.readahead.range 128K
spark.hadoop.fs.s3a.impl.disable.cache false
spark.hadoop.fs.s3a.buffer.dir /tmp/s3a

# Development Optimizations
spark.sql.execution.arrow.pyspark.enabled true
spark.sql.execution.arrow.maxRecordsPerBatch 10000
spark.ui.port 4040
spark.ui.retainedJobs 50
spark.ui.retainedStages 50
spark.ui.retainedTasks 50

# Warehouse Configuration
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing/unified
spark.sql.catalog.glue.database unified

# Thrift Server Settings
spark.sql.hive.thriftServer.singleSession true
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 4
spark.sql.hive.thriftServer.workerQueue.size 100

# Window Operations
spark.sql.window.exec.buffer.in.memory.threshold 50000
spark.sql.window.exec.buffer.spill.threshold 100000

# Join Optimizations
spark.sql.adaptive.skewJoin.enabled true
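
Once the thrift-server container is up, the effective values can be checked over JDBC — a minimal check, assuming port 10000 is published on localhost as in the compose file and that beeline ships with the Spark image under /spark/bin:

/spark/bin/beeline -u jdbc:hive2://localhost:10000 \
  -e "SET spark.sql.defaultCatalog; SET spark.sql.catalog.glue.warehouse;"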
6 changes: 0 additions & 6 deletions integration_tests/ci/profiles.yml
@@ -72,10 +72,4 @@ integration_tests:
method: thrift
host: "{{ env_var('SPARK_MASTER_HOST', 'localhost') }}"
port: 10000
user: "{{ env_var('SPARK_USER', 'spark') }}"
schema: "{{ env_var('SPARK_SCHEMA', 'default') }}"
connect_retries: 5
connect_timeout: 60
threads: 1
vars:
snowplow__datalake_file_format: iceberg
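
After this trim, a quick connectivity check from the repo root might look like the following — a sketch, assuming the thrift-server container is reachable on localhost and the profile name used in CI is unchanged:

export SPARK_MASTER_HOST=localhost
dbt debug --profiles-dir integration_tests/ci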