-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
239 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
# CI workflow: build the Dockerised Spark cluster, run a smoke job, and
# verify the Thrift server, tearing everything down afterwards.
# (Reconstructed new-side of the diff; the old Maven/spark-submit steps were removed.)
name: Spark Deployment

on:
  # Manual trigger with a selectable target environment.
  workflow_dispatch:
    inputs:
      environment:
        description: 'Deployment Environment'
        required: true
        default: 'production'
  pull_request:
    branches: [main]
  push:
    branches:
      - 'feature/**'
      - 'dev'
      - 'staging'
      - 'template-spark-tests'

env:
  # Credentials are surfaced to docker-compose via the environment, never
  # committed to the repository.
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  AWS_REGION: eu-west-1
  AWS_DEFAULT_REGION: eu-west-1

jobs:
  deploy-spark:
    runs-on: ubuntu-latest

    steps:
      # NOTE(review): checkout@v2 / setup-buildx-action@v1 run on deprecated
      # Node runtimes; consider upgrading to @v4 / @v3.
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Download JARs
        run: |
          chmod +x ./scripts/download_jars.sh
          ./scripts/download_jars.sh

      - name: Build and start Spark cluster
        run: |
          docker-compose -f docker/docker-compose.yml build
          docker-compose -f docker/docker-compose.yml up -d

      - name: Wait for services to start
        run: |
          echo "Waiting for Spark services to start..."
          sleep 60 # Increased wait time to ensure Thrift server is fully operational

      - name: Run network diagnostics
        run: |
          docker-compose -f docker/docker-compose.yml exec -T spark-master /bin/bash -c "/scripts/start-service.sh"
          docker-compose -f docker/docker-compose.yml exec -T spark-worker /bin/bash -c "/scripts/start-service.sh"
          docker-compose -f docker/docker-compose.yml exec -T thrift-server /bin/bash -c "/scripts/start-service.sh"

      - name: Run test Spark job
        run: |
          docker-compose -f docker/docker-compose.yml exec -T spark-master /spark/bin/spark-submit --class org.apache.spark.examples.SparkPi \
            --master spark://spark-master:7077 \
            --deploy-mode client \
            /spark/examples/jars/spark-examples*.jar 10

      - name: Verify Spark Thrift Server
        run: |
          docker-compose -f docker/docker-compose.yml exec -T thrift-server /spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES; SHOW TABLES IN default;"

      # Always stop the cluster, even when an earlier step failed.
      - name: Cleanup
        if: always()
        run: docker-compose -f docker/docker-compose.yml down
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Spark node image: JRE 11 base + Spark 3.5.1 + Iceberg/AWS connector JARs.
# Used by docker-compose for spark-master, spark-worker and thrift-server.
FROM openjdk:11-jre-slim

# Version pins for the artifacts downloaded below.
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3.3.4
ENV ICEBERG_VERSION=1.4.2
ENV AWS_SDK_VERSION=1.12.581

# Install only what the cluster scripts need; skip recommends and drop the
# apt lists in the same layer so the cache never lands in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        procps \
        rsync \
        ssh \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Download and install Spark. archive.apache.org retains every release,
# whereas downloads.apache.org removes superseded versions and would break
# this build once 3.5.1 rotates out.
RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    mv spark-${SPARK_VERSION}-bin-hadoop3 /spark && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz

# Make the Spark CLIs available on PATH.
ENV SPARK_HOME=/spark
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Put the Iceberg runtime and AWS connectors on Spark's classpath.
RUN mkdir -p /spark/jars && \
    wget -q https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-spark-runtime.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-aws-bundle.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O /spark/jars/hadoop-aws.jar && \
    wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -O /spark/jars/aws-java-sdk-bundle.jar

# Event-log target referenced by spark-defaults.conf (spark.eventLog.dir).
RUN mkdir -p /tmp/spark-events

WORKDIR /spark

# Compose overrides this with the per-service start command.
CMD ["bash"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Three-service standalone Spark cluster (master, one worker, Thrift server),
# all built from the same Dockerfile and sharing one bridge network.
version: '3'

networks:
  spark-network:
    driver: bridge

services:
  spark-master:
    build: .
    # Start the master, then tail its log so the container stays in the
    # foreground (the start script itself daemonises).
    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
    hostname: spark-master
    ports:
      - '8080:8080'   # master web UI
      - '7077:7077'   # cluster RPC port workers/drivers connect to
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
      # AWS credentials come from the host/CI environment; never hardcode.
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  spark-worker:
    build: .
    # Brief sleep gives the master time to bind 7077 before registration.
    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
    depends_on:
      - spark-master
    environment:
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=4G
      - SPARK_EXECUTOR_MEMORY=3G
      - SPARK_LOCAL_IP=spark-worker
      - SPARK_MASTER=spark://spark-master:7077
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  thrift-server:
    build: .
    # Longer sleep: the Thrift server needs the master AND a registered worker.
    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
    ports:
      - '10000:10000'  # HiveServer2 JDBC endpoint (beeline connects here)
    depends_on:
      - spark-master
      - spark-worker
    environment:
      - SPARK_LOCAL_IP=thrift-server
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash
# Download the Iceberg/AWS JARs required by the Spark docker images into ./jars.
# Fails fast: without -f, curl happily saves a 404 HTML page as a .jar and the
# breakage only surfaces much later inside Spark.
set -euo pipefail

JARS_DIR="./jars"
mkdir -p "$JARS_DIR"

# download_jar URL — fetch one artifact into $JARS_DIR, aborting on HTTP error.
download_jar() {
    local url=$1
    local filename
    filename=$(basename "$url")
    echo "Downloading $filename..."
    curl -fSL "$url" -o "$JARS_DIR/$filename"
}

# AWS Glue Iceberg connector
download_jar "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.3.1/iceberg-aws-bundle-1.3.1.jar"

# AWS SDK bundle
# NOTE(review): software.amazon.awssdk:aws-sdk-java is a POM-only aggregator on
# Maven Central; this .jar URL likely 404s — confirm the coordinate or drop it.
download_jar "https://repo1.maven.org/maven2/software/amazon/awssdk/aws-sdk-java/2.20.18/aws-sdk-java-2.20.18.jar"

# Iceberg Spark runtime
download_jar "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.3.1/iceberg-spark-runtime-3.3_2.12-1.3.1.jar"

# Spark SQL AWS bundle
# NOTE(review): org.apache.spark:spark-sql-aws_2.12 does not appear to exist on
# Maven Central — verify this coordinate (spark-hadoop-cloud?) or drop it.
download_jar "https://repo1.maven.org/maven2/org/apache/spark/spark-sql-aws_2.12/3.3.2/spark-sql-aws_2.12-3.3.2.jar"

# AWS Java SDK bundle
download_jar "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar"

echo "All JARs downloaded successfully."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Spark Master URL
spark.master spark://spark-master:7077

# Directory and Catalog Configuration for Iceberg and S3
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue
spark.sql.catalog.glue.database dbt-spark-iceberg

# S3 Connection Settings
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.path.style.access true
# NOTE(review): fs.s3a.region / fs.s3a.aws.region are not standard S3A keys
# (newer Hadoop uses fs.s3a.endpoint.region); kept for compatibility — verify.
spark.hadoop.fs.s3a.region eu-west-1
spark.hadoop.fs.s3a.aws.region eu-west-1

# Enabling AWS SDK V4 signing (required for regions launched after January 2014)
spark.hadoop.com.amazonaws.services.s3.enableV4 true
# Credentials are read from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, which
# docker-compose exports into every container. The previous config shipped the
# literal placeholders "<AWS_ACCESS_KEY_ID>" as fs.s3a.access.key/secret.key,
# which can never authenticate and risks real keys being pasted into the repo.
spark.hadoop.fs.s3a.aws.credentials.provider com.amazonaws.auth.EnvironmentVariableCredentialsProvider

# Hive Metastore Configuration (using AWS Glue)
spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory

# Thrift Server Configuration for better performance in concurrent environments
spark.sql.hive.thriftServer.singleSession false
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 100
spark.sql.hive.thriftServer.minWorkerThreads 50
spark.sql.hive.thriftServer.workerQueue.size 2000

# Memory and Performance Tuning
spark.driver.memory 2g
spark.executor.memory 3g
spark.worker.memory 4g
spark.network.timeout 600s
spark.sql.broadcastTimeout 600s
spark.sql.adaptive.enabled true
spark.serializer org.apache.spark.serializer.KryoSerializer

# Logging and Debugging
spark.eventLog.enabled true
spark.eventLog.dir /tmp/spark-events
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/bin/bash
# Diagnostic wrapper: print basic network information for this container,
# then exec any supplied command so it replaces this shell as PID 1.

echo "Starting network diagnostics..."
echo "Hostname: $(hostname)"
echo "IP Address: $(hostname -I)"
echo "Pinging spark-master..."
# Explicitly non-fatal: the image's apt install does not include iputils, so
# ping may be missing entirely, and diagnostics must never block startup.
ping -c 4 spark-master || echo "WARNING: ping to spark-master failed (tool missing or host unreachable)"

echo "Starting Spark service..."
# exec with no args is a no-op, so running this script bare (as the CI
# diagnostics step does) simply exits 0 after printing the info above.
exec "$@"