Commit 86c0720: Test it

ilias1111 committed Aug 2, 2024
1 parent f5c3f3b commit 86c0720

Showing 6 changed files with 239 additions and 50 deletions.
104 changes: 54 additions & 50 deletions .github/workflows/test_spark_ci.yml
@@ -1,59 +1,63 @@
-name: Spark Deployment with S3 Iceberg Glue Catalog
+name: Spark Deployment
 
 on:
-  workflow_dispatch:
-    inputs:
-      environment:
-        description: 'Deployment Environment'
-        required: true
-        default: 'production'
+  pull_request:
+    branches: [main]
+  push:
+    branches:
+      - 'feature/**'
+      - 'dev'
+      - 'staging'
+      - 'template-spark-tests'
+
+env:
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+  AWS_REGION: eu-west-1
+  AWS_DEFAULT_REGION: eu-west-1
 
 jobs:
-  deploy:
+  deploy-spark:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up JDK 11
-        uses: actions/setup-java@v2
-        with:
-          java-version: '11'
-          distribution: 'adopt'
-
-      - name: Build with Maven
-        run: mvn clean package
-
-      - name: Deploy to Spark
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          S3_BUCKET: ${{ secrets.S3_BUCKET }}
-        run: |
-          ./bin/spark-submit \
-            --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.14.0 \
-            --conf spark.sql.catalog.glue=org.apache.iceberg.spark.SparkCatalog \
-            --conf spark.sql.catalog.glue.io.impl=org.apache.iceberg.aws.s3.S3FileIO \
-            --conf spark.sql.catalog.glue.warehouse=s3://$S3_BUCKET/warehouse \
-            --class com.example.SparkApp \
-            target/spark-app.jar
-
-      - name: Create Sample Table
-        env:
-          S3_BUCKET: ${{ secrets.S3_BUCKET }}
-        run: |
-          ./bin/spark-sql \
-            --conf spark.sql.catalog.glue=org.apache.iceberg.spark.SparkCatalog \
-            --conf spark.sql.catalog.glue.io.impl=org.apache.iceberg.aws.s3.S3FileIO \
-            --conf spark.sql.catalog.glue.warehouse=s3://$S3_BUCKET/warehouse \
-            -e "CREATE TABLE glue.default.users (id INT, name STRING)"
-
-      - name: Verify Deployment
-        env:
-          S3_BUCKET: ${{ secrets.S3_BUCKET }}
-        run: |
-          ./bin/spark-sql \
-            --conf spark.sql.catalog.glue=org.apache.iceberg.spark.SparkCatalog \
-            --conf spark.sql.catalog.glue.io.impl=org.apache.iceberg.aws.s3.S3FileIO \
-            --conf spark.sql.catalog.glue.warehouse=s3://$S3_BUCKET/warehouse \
-            -e "SHOW TABLES IN glue.default"
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Download JARs
+        run: |
+          chmod +x ./scripts/download_jars.sh
+          ./scripts/download_jars.sh
+
+      - name: Build and start Spark cluster
+        run: |
+          docker-compose -f docker/docker-compose.yml build
+          docker-compose -f docker/docker-compose.yml up -d
+
+      - name: Wait for services to start
+        run: |
+          echo "Waiting for Spark services to start..."
+          sleep 60 # Increased wait time to ensure Thrift server is fully operational
+
+      - name: Run network diagnostics
+        run: |
+          docker-compose -f docker/docker-compose.yml exec -T spark-master /bin/bash -c "/scripts/start-service.sh"
+          docker-compose -f docker/docker-compose.yml exec -T spark-worker /bin/bash -c "/scripts/start-service.sh"
+          docker-compose -f docker/docker-compose.yml exec -T thrift-server /bin/bash -c "/scripts/start-service.sh"
+
+      - name: Run test Spark job
+        run: |
+          docker-compose -f docker/docker-compose.yml exec -T spark-master /spark/bin/spark-submit --class org.apache.spark.examples.SparkPi \
+            --master spark://spark-master:7077 \
+            --deploy-mode client \
+            /spark/examples/jars/spark-examples*.jar 10
+
+      - name: Verify Spark Thrift Server
+        run: |
+          docker-compose -f docker/docker-compose.yml exec -T thrift-server /spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES; SHOW TABLES IN default;"
+
+      - name: Cleanup
+        if: always()
+        run: docker-compose -f docker/docker-compose.yml down
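
The same sequence can be reproduced outside CI; a minimal sketch, assuming the compose file and scripts sit at the paths the workflow references and that the AWS credentials CI injects from repository secrets are exported in the shell:

# Credentials normally injected by GitHub secrets (placeholder values)
export AWS_ACCESS_KEY_ID=<your-key-id>
export AWS_SECRET_ACCESS_KEY=<your-secret-key>

./scripts/download_jars.sh
docker-compose -f docker/docker-compose.yml up -d --build
docker-compose -f docker/docker-compose.yml exec -T spark-master \
  /spark/bin/spark-submit --class org.apache.spark.examples.SparkPi \
  --master spark://spark-master:7077 /spark/examples/jars/spark-examples*.jar 10
docker-compose -f docker/docker-compose.yml down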
34 changes: 34 additions & 0 deletions Dockerfile
@@ -0,0 +1,34 @@
FROM openjdk:11-jre-slim

# Set environment variables
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3.3.4
ENV ICEBERG_VERSION=1.4.2
ENV AWS_SDK_VERSION=1.12.581

# Install necessary tools
RUN apt-get update && apt-get install -y curl wget procps rsync ssh

# Download and install Spark
RUN wget https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
tar -xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
mv spark-${SPARK_VERSION}-bin-hadoop3 /spark && \
rm spark-${SPARK_VERSION}-bin-hadoop3.tgz

# Set Spark environment variables
ENV SPARK_HOME=/spark
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Download necessary JARs
RUN mkdir -p /spark/jars && \
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-spark-runtime.jar && \
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-aws-bundle.jar && \
wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O /spark/jars/hadoop-aws.jar && \
wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -O /spark/jars/aws-java-sdk-bundle.jar

# Create directory for Spark events
RUN mkdir -p /tmp/spark-events

WORKDIR /spark

CMD ["bash"]
66 changes: 66 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,66 @@
version: '3'

networks:
  spark-network:
    driver: bridge

services:
  spark-master:
    build: .
    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
    hostname: spark-master
    ports:
      - '8080:8080'
      - '7077:7077'
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  spark-worker:
    build: .
    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
    depends_on:
      - spark-master
    environment:
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=4G
      - SPARK_EXECUTOR_MEMORY=3G
      - SPARK_LOCAL_IP=spark-worker
      - SPARK_MASTER=spark://spark-master:7077
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  thrift-server:
    build: .
    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
    ports:
      - '10000:10000'
    depends_on:
      - spark-master
      - spark-worker
    environment:
      - SPARK_LOCAL_IP=thrift-server
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network
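
A sketch of driving the same stack by hand, assuming docker-compose is run from the directory containing this file:

docker-compose up -d --build   # build the shared image, start all three services
docker-compose ps              # master, worker, and thrift-server should all be Up
# Probe the Thrift endpoint the same way the CI workflow does
docker-compose exec thrift-server /spark/bin/beeline \
  -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"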
28 changes: 28 additions & 0 deletions download_jars.sh
@@ -0,0 +1,28 @@
#!/bin/bash
set -euo pipefail

JARS_DIR="./jars"
mkdir -p "$JARS_DIR"

download_jar() {
    local url=$1
    local filename
    filename=$(basename "$url")
    echo "Downloading $filename..."
    # -f fails on HTTP errors instead of saving an error page as a JAR
    curl -fL "$url" -o "$JARS_DIR/$filename"
}

# Iceberg AWS bundle (Glue catalog and S3FileIO support)
download_jar "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.3.1/iceberg-aws-bundle-1.3.1.jar"

# AWS SDK for Java v2 uber-JAR (software.amazon.awssdk:bundle)
download_jar "https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.20.18/bundle-2.20.18.jar"

# Iceberg Spark runtime
download_jar "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.3.1/iceberg-spark-runtime-3.3_2.12-1.3.1.jar"

# Spark cloud-storage integration for S3A (org.apache.spark:spark-hadoop-cloud)
download_jar "https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_2.12/3.3.2/spark-hadoop-cloud_2.12-3.3.2.jar"

# AWS Java SDK bundle
download_jar "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar"

echo "All JARs downloaded successfully."
47 changes: 47 additions & 0 deletions spark-defaults.conf
@@ -0,0 +1,47 @@
# Spark Master URL
spark.master spark://spark-master:7077

# Directory and Catalog Configuration for Iceberg and S3
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue
spark.sql.catalog.glue.database dbt-spark-iceberg

# S3 Connection Settings
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key <AWS_ACCESS_KEY_ID>
spark.hadoop.fs.s3a.secret.key <AWS_SECRET_ACCESS_KEY>
spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.region eu-west-1
spark.hadoop.fs.s3a.aws.region eu-west-1

# Enabling AWS SDK V4 signing (required for regions launched after January 2014)
spark.hadoop.com.amazonaws.services.s3.enableV4 true
spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider

# Hive Metastore Configuration (using AWS Glue)
spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory

# Thrift Server Configuration for better performance in concurrent environments
spark.sql.hive.thriftServer.singleSession false
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 100
spark.sql.hive.thriftServer.minWorkerThreads 50
spark.sql.hive.thriftServer.workerQueue.size 2000

# Memory and Performance Tuning
spark.driver.memory 2g
spark.executor.memory 3g
spark.worker.memory 4g
spark.network.timeout 600s
spark.sql.broadcastTimeout 600s
spark.sql.adaptive.enabled true
spark.serializer org.apache.spark.serializer.KryoSerializer

# Logging and Debugging
spark.eventLog.enabled true
spark.eventLog.dir /tmp/spark-events
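
Because spark.sql.defaultCatalog is glue, any Spark SQL session started with this file picks up the Glue/Iceberg catalog without per-invocation flags; a minimal sketch, run from inside one of the cluster containers (table name and schema are illustrative):

# Runs against the Glue catalog defined above; no extra --conf needed
/spark/bin/spark-sql -e "CREATE TABLE IF NOT EXISTS glue.default.events (id BIGINT, ts TIMESTAMP) USING iceberg"
/spark/bin/spark-sql -e "SHOW TABLES IN glue.default"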
10 changes: 10 additions & 0 deletions start-service.sh
@@ -0,0 +1,10 @@
#!/bin/bash

echo "Starting network diagnostics..."
echo "Hostname: $(hostname)"
echo "IP Address: $(hostname -I)"
echo "Pinging spark-master..."
ping -c 4 spark-master

echo "Starting Spark service..."
exec "$@"
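
Because the script ends with exec "$@", it acts as a pass-through wrapper: it prints diagnostics, then replaces itself with whatever command follows; invoked with no arguments, as the workflow's diagnostics step does, it simply exits after the checks. A sketch of wrapper-style usage, where the worker invocation mirrors docker-compose.yml:

# Diagnostics first, then the real service takes over the process
./start-service.sh /spark/sbin/start-worker.sh spark://spark-master:7077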
