-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
spannerspark: initial code skeleton for reads (#31)
This code implements the basis of the Cloud Spanner->Spark connector. Current functionality ensures that you can create a DataFrame<Row> and then invoke .printSchema()
- Loading branch information
Showing
20 changed files
with
1,550 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
/* | ||
* Copyright 2007-present the original author or authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
import java.net.*; | ||
import java.io.*; | ||
import java.nio.channels.*; | ||
import java.util.Properties; | ||
|
||
public class MavenWrapperDownloader { | ||
|
||
private static final String WRAPPER_VERSION = "0.5.6"; | ||
/** | ||
* Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided. | ||
*/ | ||
private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/" | ||
+ WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar"; | ||
|
||
/** | ||
* Path to the maven-wrapper.properties file, which might contain a downloadUrl property to | ||
* use instead of the default one. | ||
*/ | ||
private static final String MAVEN_WRAPPER_PROPERTIES_PATH = | ||
".mvn/wrapper/maven-wrapper.properties"; | ||
|
||
/** | ||
* Path where the maven-wrapper.jar will be saved to. | ||
*/ | ||
private static final String MAVEN_WRAPPER_JAR_PATH = | ||
".mvn/wrapper/maven-wrapper.jar"; | ||
|
||
/** | ||
* Name of the property which should be used to override the default download url for the wrapper. | ||
*/ | ||
private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl"; | ||
|
||
public static void main(String args[]) { | ||
System.out.println("- Downloader started"); | ||
File baseDirectory = new File(args[0]); | ||
System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath()); | ||
|
||
// If the maven-wrapper.properties exists, read it and check if it contains a custom | ||
// wrapperUrl parameter. | ||
File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH); | ||
String url = DEFAULT_DOWNLOAD_URL; | ||
if(mavenWrapperPropertyFile.exists()) { | ||
FileInputStream mavenWrapperPropertyFileInputStream = null; | ||
try { | ||
mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile); | ||
Properties mavenWrapperProperties = new Properties(); | ||
mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream); | ||
url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url); | ||
} catch (IOException e) { | ||
System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'"); | ||
} finally { | ||
try { | ||
if(mavenWrapperPropertyFileInputStream != null) { | ||
mavenWrapperPropertyFileInputStream.close(); | ||
} | ||
} catch (IOException e) { | ||
// Ignore ... | ||
} | ||
} | ||
} | ||
System.out.println("- Downloading from: " + url); | ||
|
||
File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH); | ||
if(!outputFile.getParentFile().exists()) { | ||
if(!outputFile.getParentFile().mkdirs()) { | ||
System.out.println( | ||
"- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'"); | ||
} | ||
} | ||
System.out.println("- Downloading to: " + outputFile.getAbsolutePath()); | ||
try { | ||
downloadFileFromURL(url, outputFile); | ||
System.out.println("Done"); | ||
System.exit(0); | ||
} catch (Throwable e) { | ||
System.out.println("- Error downloading"); | ||
e.printStackTrace(); | ||
System.exit(1); | ||
} | ||
} | ||
|
||
private static void downloadFileFromURL(String urlString, File destination) throws Exception { | ||
if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) { | ||
String username = System.getenv("MVNW_USERNAME"); | ||
char[] password = System.getenv("MVNW_PASSWORD").toCharArray(); | ||
Authenticator.setDefault(new Authenticator() { | ||
@Override | ||
protected PasswordAuthentication getPasswordAuthentication() { | ||
return new PasswordAuthentication(username, password); | ||
} | ||
}); | ||
} | ||
URL website = new URL(urlString); | ||
ReadableByteChannel rbc; | ||
rbc = Channels.newChannel(website.openStream()); | ||
FileOutputStream fos = new FileOutputStream(destination); | ||
fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); | ||
fos.close(); | ||
rbc.close(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip | ||
wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,47 @@ | ||
# spark-spanner-connector | ||
|
||
## Pre-requisites | ||
- [ ] Ensure that [Databoost is enabled for your Cloud Spanner database](https://cloud.google.com/spanner/docs/databoost/databoost-applications#before_you_begin) | ||
|
||
## Running tests locally | ||
Please run the [Cloud Spanner Emulator](https://cloud.google.com/spanner/docs/emulator) locally. | ||
|
||
Before running `mvn`, please set the variable `SPANNER_EMULATOR_HOST=localhost:9090` | ||
|
||
## Compiling the JAR | ||
To compile it against Spark 3.1, Please run | ||
```shell | ||
./mvnw install -P3.1 | ||
``` | ||
|
||
## Submitting the job to Google Dataproc | ||
```shell | ||
gcloud dataproc jobs submit pyspark --cluster "spanner-spark-cluster" \ | ||
--jars=./spark-3.1-spanner/target/spark-3.1-spanner-0.0.1-SNAPSHOT.jar \ | ||
--region us-central1 examples/SpannerSpark.py | ||
``` | ||
|
||
## Submitting the Spark job locally | ||
```shell | ||
./bin/spark-shell --jars \ | ||
local:/spark-spanner-connector/spark-3.1-spanner-lib/target/spark-3.1-spanner-lib-0.0.1-SNAPSHOT.jar | ||
``` | ||
which will pull up a spark shell and you can run it by passing in the options | ||
to create the connection to Cloud Spanner. | ||
|
||
### Variables needed | ||
|
||
Variable|Comments | ||
---|--- | ||
projectId|The projectID containing the Cloud Spanner database | ||
instanceId|The instanceID of the Cloud Spanner database | ||
databaseId|The databaseID of the Cloud Spanner database | ||
table|The Table of the Cloud Spanner database that you are reading from | ||
|
||
|
||
### Spark shell example | ||
```shell | ||
var df = spark.read.format("cloud-spanner").option("table", "ATable").option("projectId", PROJECT_ID).option("instanceId", INSTANCE_ID).option("databaseId", DATABASE_ID).load() | ||
df.show() | ||
df.printSchema() | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Copyright 2023 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package com.google.cloud.spark.spanner.examples; | ||
|
||
import org.apache.spark.sql.Dataset; | ||
import org.apache.spark.sql.Row; | ||
import org.apache.spark.sql.SparkSession; | ||
|
||
public class SpannerSpark { | ||
public static void main(String[] args) { | ||
SparkSession spark = SparkSession | ||
.builder() | ||
.appName("cloud spanner for census 2020") | ||
.getOrCreate(); | ||
|
||
|
||
Dataset<Row> df = spark.read() | ||
.format("cloud-spanner") | ||
.option("table", "people") | ||
.option("projectId", System.getenv("SPANNER_SPARK_PROJECT")) | ||
.option("instanceId", System.getenv("SPANNER_SPARK_INSTANCE")) | ||
.option("database", System.getenv("SPANNER_SPARK_DATABASE")) | ||
.load(); | ||
df.show(); | ||
df.printSchema(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#/usr/bin/env python | ||
|
||
# Copyright 2023 Google LLC. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from pyspark.sql import SparkSession | ||
|
||
def main(): | ||
spark = SparkSession.builder.appName("Query Spanner").getOrCreate() | ||
df = spark.read.format("cloud-spanner").option("table", "ATable").load() | ||
df.show() | ||
df.printSchema() | ||
|
||
if __name__ == '__main__': | ||
main() |
Oops, something went wrong.