Skip to content

Commit

Permalink
spannerspark: initial code skeleton for reads (#31)
Browse files Browse the repository at this point in the history
This code implements the basis of the Cloud Spanner->Spark connector.
Current functionality ensures that you can create a DataFrame<Row>
and then invoke .printSchema()
  • Loading branch information
odeke-em authored Aug 11, 2023
1 parent de3b985 commit 2ed95b0
Show file tree
Hide file tree
Showing 20 changed files with 1,550 additions and 4 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ jobs:
cache: 'maven'
- name: Run the Maven verify phase
run: mvn --batch-mode --update-snapshots verify

env:
JOB_TYPE: test
SPANNER_EMULATOR_HOST: localhost:9010
GOOGLE_CLOUD_PROJECT: spark-spanner-test
117 changes: 117 additions & 0 deletions .mvn/wrapper/MavenWrapperDownloader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Copyright 2007-present the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.net.*;
import java.io.*;
import java.nio.channels.*;
import java.util.Properties;

public class MavenWrapperDownloader {

private static final String WRAPPER_VERSION = "0.5.6";
/**
* Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided.
*/
private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/"
+ WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar";

/**
* Path to the maven-wrapper.properties file, which might contain a downloadUrl property to
* use instead of the default one.
*/
private static final String MAVEN_WRAPPER_PROPERTIES_PATH =
".mvn/wrapper/maven-wrapper.properties";

/**
* Path where the maven-wrapper.jar will be saved to.
*/
private static final String MAVEN_WRAPPER_JAR_PATH =
".mvn/wrapper/maven-wrapper.jar";

/**
* Name of the property which should be used to override the default download url for the wrapper.
*/
private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl";

public static void main(String args[]) {
System.out.println("- Downloader started");
File baseDirectory = new File(args[0]);
System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath());

// If the maven-wrapper.properties exists, read it and check if it contains a custom
// wrapperUrl parameter.
File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH);
String url = DEFAULT_DOWNLOAD_URL;
if(mavenWrapperPropertyFile.exists()) {
FileInputStream mavenWrapperPropertyFileInputStream = null;
try {
mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile);
Properties mavenWrapperProperties = new Properties();
mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream);
url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url);
} catch (IOException e) {
System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'");
} finally {
try {
if(mavenWrapperPropertyFileInputStream != null) {
mavenWrapperPropertyFileInputStream.close();
}
} catch (IOException e) {
// Ignore ...
}
}
}
System.out.println("- Downloading from: " + url);

File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
if(!outputFile.getParentFile().exists()) {
if(!outputFile.getParentFile().mkdirs()) {
System.out.println(
"- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'");
}
}
System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
try {
downloadFileFromURL(url, outputFile);
System.out.println("Done");
System.exit(0);
} catch (Throwable e) {
System.out.println("- Error downloading");
e.printStackTrace();
System.exit(1);
}
}

private static void downloadFileFromURL(String urlString, File destination) throws Exception {
if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) {
String username = System.getenv("MVNW_USERNAME");
char[] password = System.getenv("MVNW_PASSWORD").toCharArray();
Authenticator.setDefault(new Authenticator() {
@Override
protected PasswordAuthentication getPasswordAuthentication() {
return new PasswordAuthentication(username, password);
}
});
}
URL website = new URL(urlString);
ReadableByteChannel rbc;
rbc = Channels.newChannel(website.openStream());
FileOutputStream fos = new FileOutputStream(destination);
fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
fos.close();
rbc.close();
}

}
2 changes: 2 additions & 0 deletions .mvn/wrapper/maven-wrapper.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,47 @@
# spark-spanner-connector

## Pre-requisites
- [ ] Ensure that [Databoost is enabled for your Cloud Spanner database](https://cloud.google.com/spanner/docs/databoost/databoost-applications#before_you_begin)

## Running tests locally
Please run the [Cloud Spanner Emulator](https://cloud.google.com/spanner/docs/emulator) locally.

Before running `mvn`, please set the variable `SPANNER_EMULATOR_HOST=localhost:9090`

## Compiling the JAR
To compile it against Spark 3.1, Please run
```shell
./mvnw install -P3.1
```

## Submitting the job to Google Dataproc
```shell
gcloud dataproc jobs submit pyspark --cluster "spanner-spark-cluster" \
--jars=./spark-3.1-spanner/target/spark-3.1-spanner-0.0.1-SNAPSHOT.jar \
--region us-central1 examples/SpannerSpark.py
```

## Submitting the Spark job locally
```shell
./bin/spark-shell --jars \
local:/spark-spanner-connector/spark-3.1-spanner-lib/target/spark-3.1-spanner-lib-0.0.1-SNAPSHOT.jar
```
which will pull up a spark shell and you can run it by passing in the options
to create the connection to Cloud Spanner.

### Variables needed

Variable|Comments
---|---
projectId|The projectID containing the Cloud Spanner database
instanceId|The instanceID of the Cloud Spanner database
databaseId|The databaseID of the Cloud Spanner database
table|The Table of the Cloud Spanner database that you are reading from


### Spark shell example
```shell
var df = spark.read.format("cloud-spanner").option("table", "ATable").option("projectId", PROJECT_ID).option("instanceId", INSTANCE_ID).option("databaseId", DATABASE_ID).load()
df.show()
df.printSchema()
```
39 changes: 39 additions & 0 deletions examples/SpannerSpark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.cloud.spark.spanner.examples;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SpannerSpark {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("cloud spanner for census 2020")
.getOrCreate();


Dataset<Row> df = spark.read()
.format("cloud-spanner")
.option("table", "people")
.option("projectId", System.getenv("SPANNER_SPARK_PROJECT"))
.option("instanceId", System.getenv("SPANNER_SPARK_INSTANCE"))
.option("database", System.getenv("SPANNER_SPARK_DATABASE"))
.load();
df.show();
df.printSchema();
}
}
26 changes: 26 additions & 0 deletions examples/SpannerSpark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#/usr/bin/env python

# Copyright 2023 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pyspark.sql import SparkSession

def main():
spark = SparkSession.builder.appName("Query Spanner").getOrCreate()
df = spark.read.format("cloud-spanner").option("table", "ATable").load()
df.show()
df.printSchema()

if __name__ == '__main__':
main()
Loading

0 comments on commit 2ed95b0

Please sign in to comment.