add the dataio-test project
* add the JavaImplicitConverters trait

* add the SparkSpec trait

* add the SparkStreamingSpec trait

* add the FileSystemSpec trait

* update the documentation
marclamy committed Apr 10, 2024
1 parent 62eeccf commit 00da7a0
Showing 14 changed files with 526 additions and 72 deletions.
145 changes: 77 additions & 68 deletions build.sbt
@@ -1,43 +1,40 @@
ThisBuild / versionScheme := Some("strict")

// Build configuration
ThisBuild / scalaVersion := "2.12.12"
ThisBuild / organization := "com.amadeus.dataio"
val sparkVersion = settingKey[String]("The version of Spark used for building.")
ThisBuild / sparkVersion := "3.4.1"

// Common dependencies
ThisBuild / libraryDependencies ++= Seq(
  // Core
  "org.apache.logging.log4j" %% "log4j-api-scala" % "12.0",
  "org.apache.logging.log4j" % "log4j-api" % "2.19.0",
  "com.typesafe" % "config" % "1.4.0",
  "commons-io" % "commons-io" % "2.9.0",
  // Spark
  "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion.value,
  "org.apache.spark" %% "spark-sql" % sparkVersion.value,
-  "org.apache.spark" %% "spark-core" % sparkVersion.value,
-  // Tests
-  "org.scalatest" %% "scalatest" % "3.2.16" % Test,
-  "org.scalamock" %% "scalamock" % "5.2.0" % Test
+  "org.apache.spark" %% "spark-core" % sparkVersion.value
)

// Tests configuration
ThisBuild / Test / parallelExecution := false
ThisBuild / Test / publishArtifact := false

// Publication configuration
ThisBuild / publishTo := Some("GitHub Packages" at "https://maven.pkg.github.com/AmadeusITGroup/dataio-framework")
ThisBuild / credentials += Credentials(
  "GitHub Package Registry",
  "maven.pkg.github.com",
  "",
  sys.env.getOrElse("GITHUB_REGISTRY_TOKEN", "")
)
ThisBuild / publishMavenStyle := true
ThisBuild / pomIncludeRepository := { _ => true }
ThisBuild / pomExtra :=
  <url>https://github.com/AmadeusITGroup/dataio-framework</url>
  <licenses>
    <license>
      <name>Apache License 2.0</name>
@@ -46,45 +43,57 @@ ThisBuild / versionScheme := Some("strict")
  </licenses>

// Release configuration
import ReleaseTransformations._

releaseVersionBump := sbtrelease.Version.Bump.Minor
releaseProcess := Seq[ReleaseStep](
  checkSnapshotDependencies,
  inquireVersions,
  runClean,
  runTest,
  setReleaseVersion,
  commitReleaseVersion,
  tagRelease,
  publishArtifacts,
  setNextVersion,
  commitNextVersion,
  pushChanges
)

// Projects configuration
lazy val root = (project in file("."))
-  .aggregate(core)
+  .aggregate(core, test)
  .settings(
    publishArtifact := false
  )

lazy val core = (project in file("core"))
  .settings(
    name := "dataio-core",
    libraryDependencies ++= Seq(
      // Distribution
      "javax.mail" % "mail" % "1.4.7",
      // Input / Output
      "com.crealytics" %% "spark-excel" % s"${sparkVersion.value}_0.19.0",
      "org.elasticsearch" %% "elasticsearch-spark-30" % "8.4.3"
        exclude ("org.scala-lang", "scala-library")
        exclude ("org.scala-lang", "scala-reflect")
        exclude ("org.slf4j", "slf4j-api")
        exclude ("org.apache.spark", "spark-core_" + scalaVersion.value.substring(0, 4))
        exclude ("org.apache.spark", "spark-sql_" + scalaVersion.value.substring(0, 4))
        exclude ("org.apache.spark", "spark-catalyst_" + scalaVersion.value.substring(0, 4))
-        exclude ("org.apache.spark", "spark-streaming_" + scalaVersion.value.substring(0, 4)),
-      // Tests
-      "org.scalatest" %% "scalatest" % "3.2.16" % Test,
-      "org.scalamock" %% "scalamock" % "5.2.0" % Test
+        exclude ("org.apache.spark", "spark-streaming_" + scalaVersion.value.substring(0, 4))
    )
  )

+lazy val test = (project in file("test"))
+  .settings(
+    name := "dataio-test",
+    libraryDependencies ++= Seq(
+      "org.scalatest" %% "scalatest" % "3.2.16",
+      "org.scalamock" %% "scalamock" % "5.2.0"
+    )
+  )
2 changes: 1 addition & 1 deletion docs/content/advanced/advanced.md
@@ -2,6 +2,6 @@
title: Advanced
layout: default
has_children: true
-nav_order: 6
+nav_order: 7
---
# Advanced
2 changes: 1 addition & 1 deletion docs/content/configuration/configuration.md
@@ -2,7 +2,7 @@
title: Configuration
layout: default
has_children: true
-nav_order: 5
+nav_order: 6
---
# Configuration

4 changes: 2 additions & 2 deletions docs/content/getting-started.md
@@ -17,7 +17,7 @@ nav_order: 2

## Installation

-Data I/O was built with Spark 3.3.2 and Scala 2.12. Support for prior versions is not guaranteed.
+Data I/O was built and tested with Spark 3.2.1/3.3.2/3.4.1 and Scala 2.12. Support for prior versions is not guaranteed.
{: .warning}

Published releases are available on GitHub Packages, in the AmadeusITGroup repository.
@@ -27,7 +27,7 @@ Using Maven:
```xml
<dependency>
<groupId>com.amadeus.dataio</groupId>
-  <artifactId>dataio-framework</artifactId>
+  <artifactId>dataio-core</artifactId>
<version>x.x.x</version>
</dependency>
```
107 changes: 107 additions & 0 deletions docs/content/tests.md
@@ -0,0 +1,107 @@
---
title: Writing tests
layout: default
nav_order: 5
---
# Writing tests
<details open markdown="block">
<summary>
Table of contents
</summary>
{: .text-delta }
1. TOC
{:toc}
</details>

---

Data I/O offers a separate library of utility traits and methods designed to facilitate testing Scala/Spark applications.

## Installation

Published releases are available on GitHub Packages, in the AmadeusITGroup repository.

Using Maven:

```xml
<dependency>
<groupId>com.amadeus.dataio</groupId>
<artifactId>dataio-test</artifactId>
<version>x.x.x</version>
</dependency>
```
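
If you build with sbt, the dependency would look roughly like the sketch below. Note this is an assumption about how the artifact is published: depending on whether it is cross-published with a Scala-version suffix, the plain `%` form with an explicit `dataio-test_2.12` artifact name may be needed instead.

```scala
// Test-scoped dependency on dataio-test; replace x.x.x with a published release.
libraryDependencies += "com.amadeus.dataio" %% "dataio-test" % "x.x.x" % Test
```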

## Overview


### Interacting with the file system
The `FileSystemSpec` trait provides the Hadoop `LocalFileSystem` for tests needing direct access to an instance of `FileSystem`.

Example:

```scala
import com.amadeus.dataio.test._
import org.apache.hadoop.fs.Path
import org.scalatest.flatspec.AnyFlatSpec

class MyAppTest extends AnyFlatSpec with FileSystemSpec {
  "MyAppTest" should "do something" in {
    assert(fs.exists(new Path("file:///my_file.txt")))
  }
}
```


### Interacting with a SparkSession
The `SparkSpec` trait provides a local Spark session and helper functions for Spark tests:
- `getTestName: String`: Returns the test suite's name.
- `collectData(path: String, format: String, schema: Option[String] = None): Array[String]`: Collects data from the file system.

Note that when extending this trait, you must override the `getTestName: String` function.

Example:

```scala
import com.amadeus.dataio.test._
import org.scalatest.flatspec.AnyFlatSpec

class MyAppTest extends AnyFlatSpec with SparkSpec {
  override def getTestName = "MyAppTest"

  "MyAppTest" should "do something" in {
    spark.read.format("csv").load("my_data.csv")
    collectData("my_data.csv", "csv")
  }
}
```


### Interacting with a Streaming context
The `SparkStreamingSpec` trait provides a local Spark session and helper functions for Spark Streaming tests:
- `enableSparkStreamingSchemaInference(): Unit`: Enables Spark streaming schema inference.
- `collectDataStream(dataFrame: DataFrame): Array[String]`: Collects data from a DataFrame read from a stream using an in-memory sink.
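
A minimal sketch of a streaming test, assuming `SparkStreamingSpec` exposes the Spark session as `spark` and requires overriding `getTestName`, as `SparkSpec` does:

```scala
import com.amadeus.dataio.test._
import org.scalatest.flatspec.AnyFlatSpec

class MyStreamingAppTest extends AnyFlatSpec with SparkStreamingSpec {
  override def getTestName = "MyStreamingAppTest"

  "MyStreamingAppTest" should "collect data from a stream" in {
    // Let Spark infer the schema of streaming sources during tests.
    enableSparkStreamingSchemaInference()

    // Read a directory as a stream, then drain it through an in-memory sink.
    val stream = spark.readStream.format("csv").load("file:///tmp/dataio-test/input/")
    val result = collectDataStream(stream)
  }
}
```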


### Implicitly converting Scala Maps and Lists to their Java equivalents
It is sometimes necessary to build complex map structures when creating Typesafe `Config` objects, which requires redundant Scala-to-Java conversions.

To simplify this, you may extend the `JavaImplicitConverters` trait.

Example:

```scala
import com.amadeus.dataio.test._
import com.typesafe.config.ConfigFactory
import org.scalatest.flatspec.AnyFlatSpec

class MyAppTest extends AnyFlatSpec with JavaImplicitConverters {
  "MyAppTest" should "do something" in {
    ConfigFactory.parseMap(
      Map("NodeName" -> Seq(Map("Type" -> "com.Entity"), Map("Type" -> "com.Entity")))
    )
  }
}
```

36 changes: 36 additions & 0 deletions test/src/main/scala/com/amadeus/dataio/test/FileSystemSpec.scala
@@ -0,0 +1,36 @@
package com.amadeus.dataio.test

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.scalatest.{BeforeAndAfter, TestSuite}

/**
* Provides the Hadoop LocalFileSystem for tests needing direct access to an instance of FileSystem.
*
 * A dedicated instance is initialized before each test and automatically closed after each test, isolating tests
 * from each other as much as possible. The dataio-test temporary directory (/tmp/dataio-test/) and its
 * sub-directories are also deleted before the FileSystem is closed.
*
* e.g.
* {{{
 * class MyClassTest extends WordSpec with FileSystemSpec {
* // provided by FileSystemSpec:
* // fs: FileSystem
* // val tmpPath: String = "file:///tmp/dataio-test/"
* }
* }}}
*/
trait FileSystemSpec extends TestSuite with BeforeAndAfter {
val tmpPath = "file:///tmp/dataio-test/"

var fs: FileSystem = _

before {
fs = FileSystem.newInstance(new Configuration())
}

after {
fs.delete(new Path(tmpPath), true)
fs.close()
}
}