spannerspark: initial code skeleton for reads (#31)

This code implements the basis of the Cloud Spanner->Spark connector. Current functionality ensures that you can create a DataFrame<Row> and then invoke .printSchema()
GoogleCloudDataproc · Aug 11, 2023 · 2ed95b0 · 2ed95b0
1 parent de3b985
commit 2ed95b0
Show file tree

Hide file tree

Showing 20 changed files with 1,550 additions and 4 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -26,3 +26,8 @@ jobs:
           cache: 'maven'
       - name: Run the Maven verify phase
         run: mvn --batch-mode --update-snapshots verify
+
+        env:
+          JOB_TYPE: test
+          SPANNER_EMULATOR_HOST: localhost:9010
+          GOOGLE_CLOUD_PROJECT: spark-spanner-test
diff --git a/.mvn/wrapper/MavenWrapperDownloader.java b/.mvn/wrapper/MavenWrapperDownloader.java
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2007-present the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.net.*;
+import java.io.*;
+import java.nio.channels.*;
+import java.util.Properties;
+
+public class MavenWrapperDownloader {
+
+    private static final String WRAPPER_VERSION = "0.5.6";
+    /**
+     * Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided.
+     */
+    private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/"
+        + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar";
+
+    /**
+     * Path to the maven-wrapper.properties file, which might contain a downloadUrl property to
+     * use instead of the default one.
+     */
+    private static final String MAVEN_WRAPPER_PROPERTIES_PATH =
+            ".mvn/wrapper/maven-wrapper.properties";
+
+    /**
+     * Path where the maven-wrapper.jar will be saved to.
+     */
+    private static final String MAVEN_WRAPPER_JAR_PATH =
+            ".mvn/wrapper/maven-wrapper.jar";
+
+    /**
+     * Name of the property which should be used to override the default download url for the wrapper.
+     */
+    private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl";
+
+    public static void main(String args[]) {
+        System.out.println("- Downloader started");
+        File baseDirectory = new File(args[0]);
+        System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath());
+
+        // If the maven-wrapper.properties exists, read it and check if it contains a custom
+        // wrapperUrl parameter.
+        File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH);
+        String url = DEFAULT_DOWNLOAD_URL;
+        if(mavenWrapperPropertyFile.exists()) {
+            FileInputStream mavenWrapperPropertyFileInputStream = null;
+            try {
+                mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile);
+                Properties mavenWrapperProperties = new Properties();
+                mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream);
+                url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url);
+            } catch (IOException e) {
+                System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'");
+            } finally {
+                try {
+                    if(mavenWrapperPropertyFileInputStream != null) {
+                        mavenWrapperPropertyFileInputStream.close();
+                    }
+                } catch (IOException e) {
+                    // Ignore ...
+                }
+            }
+        }
+        System.out.println("- Downloading from: " + url);
+
+        File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
+        if(!outputFile.getParentFile().exists()) {
+            if(!outputFile.getParentFile().mkdirs()) {
+                System.out.println(
+                        "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'");
+            }
+        }
+        System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
+        try {
+            downloadFileFromURL(url, outputFile);
+            System.out.println("Done");
+            System.exit(0);
+        } catch (Throwable e) {
+            System.out.println("- Error downloading");
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    private static void downloadFileFromURL(String urlString, File destination) throws Exception {
+        if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) {
+            String username = System.getenv("MVNW_USERNAME");
+            char[] password = System.getenv("MVNW_PASSWORD").toCharArray();
+            Authenticator.setDefault(new Authenticator() {
+                @Override
+                protected PasswordAuthentication getPasswordAuthentication() {
+                    return new PasswordAuthentication(username, password);
+                }
+            });
+        }
+        URL website = new URL(urlString);
+        ReadableByteChannel rbc;
+        rbc = Channels.newChannel(website.openStream());
+        FileOutputStream fos = new FileOutputStream(destination);
+        fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
+        fos.close();
+        rbc.close();
+    }
+
+}
diff --git a/.mvn/wrapper/maven-wrapper.properties b/.mvn/wrapper/maven-wrapper.properties
@@ -0,0 +1,2 @@
+distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip
+wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar
diff --git a/README.md b/README.md
@@ -1 +1,47 @@
 # spark-spanner-connector
+
+## Pre-requisites
+- [ ] Ensure that [Databoost is enabled for your Cloud Spanner database](https://cloud.google.com/spanner/docs/databoost/databoost-applications#before_you_begin)
+
+## Running tests locally
+Please run the [Cloud Spanner Emulator](https://cloud.google.com/spanner/docs/emulator) locally.
+
+Before running `mvn`, please set the variable `SPANNER_EMULATOR_HOST=localhost:9090`
+
+## Compiling the JAR
+To compile it against Spark 3.1, Please run
+```shell
+./mvnw install -P3.1
+```
+
+## Submitting the job to Google Dataproc
+```shell
+gcloud dataproc jobs submit pyspark --cluster "spanner-spark-cluster" \
+    --jars=./spark-3.1-spanner/target/spark-3.1-spanner-0.0.1-SNAPSHOT.jar \
+    --region us-central1 examples/SpannerSpark.py
+```
+
+## Submitting the Spark job locally
+```shell
+./bin/spark-shell --jars \
+        local:/spark-spanner-connector/spark-3.1-spanner-lib/target/spark-3.1-spanner-lib-0.0.1-SNAPSHOT.jar
+```
+which will pull up a spark shell and you can run it by passing in the options
+to create the connection to Cloud Spanner.
+
+### Variables needed
+
+Variable|Comments
+---|---
+projectId|The projectID containing the Cloud Spanner database
+instanceId|The instanceID of the Cloud Spanner database
+databaseId|The databaseID of the Cloud Spanner database
+table|The Table of the Cloud Spanner database that you are reading from
+
+
+### Spark shell example
+```shell
+var df = spark.read.format("cloud-spanner").option("table", "ATable").option("projectId", PROJECT_ID).option("instanceId", INSTANCE_ID).option("databaseId", DATABASE_ID).load()
+df.show()
+df.printSchema()
+```
diff --git a/examples/SpannerSpark.java b/examples/SpannerSpark.java
@@ -0,0 +1,39 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.cloud.spark.spanner.examples;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+public class SpannerSpark {
+    public static void main(String[] args) {
+        SparkSession spark = SparkSession
+            .builder()
+            .appName("cloud spanner for census 2020")
+            .getOrCreate();
+
+
+        Dataset<Row> df = spark.read()
+            .format("cloud-spanner")
+            .option("table", "people")
+            .option("projectId", System.getenv("SPANNER_SPARK_PROJECT"))
+            .option("instanceId", System.getenv("SPANNER_SPARK_INSTANCE"))
+            .option("database", System.getenv("SPANNER_SPARK_DATABASE"))
+            .load();
+        df.show();
+        df.printSchema();
+    }
+}
diff --git a/examples/SpannerSpark.py b/examples/SpannerSpark.py
@@ -0,0 +1,26 @@
+#/usr/bin/env python
+
+# Copyright 2023 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import SparkSession
+
+def main():
+    spark = SparkSession.builder.appName("Query Spanner").getOrCreate()
+    df = spark.read.format("cloud-spanner").option("table", "ATable").load()
+    df.show()
+    df.printSchema()
+
+if __name__ == '__main__':
+    main()