From 27a3cc04b807d95a16f2534d81dc1e86d03847c9 Mon Sep 17 00:00:00 2001
From: coachDave
Date: Thu, 3 Aug 2023 19:04:49 +0200
Subject: [PATCH] Added Schema reader, completion engine and a demo

---
 pom.xml                                       |     9 +-
 .../sdl-schema/sdl-schema-2.5.0.json          | 10111 ++++++++++++++++
 .../completion/SDLBCompletionEngine.scala     |     7 +
 .../completion/SDLBCompletionEngineImpl.scala |    37 +
 .../completion/schema/ItemType.scala          |    22 +
 .../completion/schema/SchemaItem.scala        |     3 +
 .../completion/schema/SchemaReader.scala      |     4 +
 .../completion/schema/SchemaReaderImpl.scala  |    22 +
 .../smartdatalake/context/SDLBContext.scala   |     2 +-
 .../context/hocon/HoconParser.scala           |     4 +-
 .../SmartDataLakeTextDocumentService.scala    |    68 +-
 .../utils/MultiLineTransformer.scala          |     6 +-
 .../fixture/{ => hocon}/airport-example.conf  |     0
 .../fixture/{ => hocon}/basic-example.conf    |     0
 .../{ => hocon}/with-comments-example.conf    |     0
 .../{ => hocon}/with-lists-example.conf       |     0
 .../{ => hocon}/with-multi-lines-example.conf |     0
 .../with-multi-lines-flattened-example.conf   |     0
 .../fixture/sdl-schema/sdl-schema-2.5.0.json  | 10111 ++++++++++++++++
 src/test/resources/playground/basic.conf      |     4 +-
 src/test/resources/playground/demo.conf       |    36 +
 .../completion/SDLBCompletionEngineSpec.scala |    22 +
 .../completion/schema/SchemaReaderSpec.scala  |    67 +
 .../context/SDLBContextSpec.scala             |     8 +-
 .../context/hocon/HoconParserSpec.scala       |     8 +-
 .../SmartDataLakeLanguageServerSpec.scala     |     2 +-
 ...SmartDataLakeTextDocumentServiceSpec.scala |     4 +-
 .../utils/MultiLineTransformerSpec.scala      |     6 +-
 28 files changed, 20516 insertions(+), 47 deletions(-)
 create mode 100644 src/main/resources/sdl-schema/sdl-schema-2.5.0.json
 create mode 100644 src/main/scala/io/smartdatalake/completion/SDLBCompletionEngine.scala
 create mode 100644 src/main/scala/io/smartdatalake/completion/SDLBCompletionEngineImpl.scala
 create mode 100644 src/main/scala/io/smartdatalake/completion/schema/ItemType.scala
 create mode 100644 src/main/scala/io/smartdatalake/completion/schema/SchemaItem.scala
 create mode 100644 src/main/scala/io/smartdatalake/completion/schema/SchemaReader.scala
 create mode 100644 src/main/scala/io/smartdatalake/completion/schema/SchemaReaderImpl.scala
 rename src/test/resources/fixture/{ => hocon}/airport-example.conf (100%)
 rename src/test/resources/fixture/{ => hocon}/basic-example.conf (100%)
 rename src/test/resources/fixture/{ => hocon}/with-comments-example.conf (100%)
 rename src/test/resources/fixture/{ => hocon}/with-lists-example.conf (100%)
 rename src/test/resources/fixture/{ => hocon}/with-multi-lines-example.conf (100%)
 rename src/test/resources/fixture/{ => hocon}/with-multi-lines-flattened-example.conf (100%)
 create mode 100644 src/test/resources/fixture/sdl-schema/sdl-schema-2.5.0.json
 create mode 100644 src/test/resources/playground/demo.conf
 create mode 100644 src/test/scala/io/smartdatalake/completion/SDLBCompletionEngineSpec.scala
 create mode 100644 src/test/scala/io/smartdatalake/completion/schema/SchemaReaderSpec.scala
 rename src/test/scala/io/smartdatalake/{context => }/languageserver/SmartDataLakeLanguageServerSpec.scala (93%)
 rename src/test/scala/io/smartdatalake/{context => }/languageserver/SmartDataLakeTextDocumentServiceSpec.scala (87%)
 rename src/test/scala/io/smartdatalake/{context => }/utils/MultiLineTransformerSpec.scala (86%)

diff --git a/pom.xml b/pom.xml
index b9511d7..9812aa7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
     1.8
     1.8
     UTF-8
-    3.2.1
+    3.3.0
     0.21.0
     1.4.2

@@ -50,6 +50,13 @@
     ${typesafe.version}

+
+    com.lihaoyi
+    ujson_3
+    3.1.2
+
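Note: the ujson dependency added above presumably backs the SchemaReader introduced by this patch (SchemaReaderImpl.scala, whose contents are not shown in this excerpt). The following is a minimal, hypothetical sketch of loading the bundled schema resource and listing its top-level definition groups; `SchemaSketch` is an illustrative name and this is not the patch's actual implementation.

```scala
// Illustrative sketch only (not part of this patch): load the packaged
// sdl-schema JSON with ujson and enumerate its definition categories.
import scala.io.Source
import scala.util.Using

object SchemaSketch {
  def main(args: Array[String]): Unit = {
    // Classpath resource matching src/main/resources/sdl-schema/sdl-schema-2.5.0.json added below.
    val jsonText =
      Using.resource(Source.fromResource("sdl-schema/sdl-schema-2.5.0.json"))(_.mkString)
    val schema = ujson.read(jsonText)
    // The schema's top-level "definitions" object groups types by category,
    // e.g. ExecutionMode, ValidationRule, Connection, HousekeepingMode, DataObject.
    schema("definitions").obj.keys.foreach(println)
  }
}
```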
+ diff --git a/src/main/resources/sdl-schema/sdl-schema-2.5.0.json b/src/main/resources/sdl-schema/sdl-schema-2.5.0.json new file mode 100644 index 0000000..ebcf95c --- /dev/null +++ b/src/main/resources/sdl-schema/sdl-schema-2.5.0.json @@ -0,0 +1,10111 @@ +{ + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "version": "2.5.0", + "id": "sdl-schema-2.5.0.json#", + "definitions": { + "ExecutionMode": { + "CustomMode": { + "type": "object", + "properties": { + "type": { + "const": "CustomMode" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomModeLogic]]" + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing over multiple actions in case of errors." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options specified in the configuration for this execution mode" + } + }, + "title": "CustomMode", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Execution mode to create custom execution mode logic.\nDefine a function which receives main input&output DataObject and returns execution mode result" + }, + "CustomPartitionMode": { + "type": "object", + "properties": { + "type": { + "const": "CustomPartitionMode" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomPartitionModeLogic]]" + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing all partitions over multiple actions in case of errors." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options specified in the configuration for this execution mode" + } + }, + "title": "CustomPartitionMode", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Execution mode to create custom partition execution mode logic.\n\nDefine a function which receives main input&output DataObject and returns partition values to process as`Seq[Map[String,String]]`" + }, + "DataFrameIncrementalMode": { + "type": "object", + "properties": { + "type": { + "const": "DataFrameIncrementalMode" + }, + "compareCol": { + "type": "string", + "description": "a comparable column name existing in mainInput and mainOutput used to identify the delta. Column content should be bigger for newer records." + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing all partitions over multiple actions in case of errors." + }, + "applyCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." 
+ } + }, + "title": "DataFrameIncrementalMode", + "required": [ + "type", + "compareCol" + ], + "additionalProperties": false, + "description": "Compares max entry in \\\"compare column\\\" between mainOutput and mainInput and incrementally loads the delta.\nThis mode works only with SparkSubFeeds. The filter is not propagated to following actions." + }, + "DataObjectStateIncrementalMode": { + "type": "object", + "properties": { + "type": { + "const": "DataObjectStateIncrementalMode" + } + }, + "title": "DataObjectStateIncrementalMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "An execution mode for incremental processing by remembering DataObjects state from last increment." + }, + "FailIfNoPartitionValuesMode": { + "type": "object", + "properties": { + "type": { + "const": "FailIfNoPartitionValuesMode" + } + }, + "title": "FailIfNoPartitionValuesMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "An execution mode which just validates that partition values are given.\nNote: For start nodes of the DAG partition values can be defined by command line, for subsequent nodes partition values are passed on from previous nodes." + }, + "FileIncrementalMoveMode": { + "type": "object", + "properties": { + "type": { + "const": "FileIncrementalMoveMode" + }, + "archivePath": { + "type": "string", + "description": "if an archive directory is configured, files are moved into that directory instead of deleted, preserving partition layout.\nIf this is a relative path, e.g. \\\"_archive\\\", it is appended after the path of the DataObject.\nIf this is an absolute path it replaces the path of the DataObject." + } + }, + "title": "FileIncrementalMoveMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Execution mode to incrementally process file-based DataObjects, e.g. FileRefDataObjects and SparkFileDataObjects.\nFor FileRefDataObjects:\n- All existing files in the input DataObject are processed and removed (deleted or archived) after processing\n- Input partition values are applied to search for files and also used as output partition values\nFor SparkFileDataObjects:\n- Files processed are read from the DataFrames execution plan and removed (deleted or archived) after processing.\nNote that is only correct if no additional filters are applied in the DataFrame.\nA better implementation would be to observe files by a custom metric. Unfortunately there is a problem in Spark with that, see also[[CollectSetDeterministic]] \n- Partition values preserved." + }, + "KafkaStateIncrementalMode": { + "type": "object", + "properties": { + "type": { + "const": "KafkaStateIncrementalMode" + }, + "delayedMaxTimestampExpr": { + "type": "string", + "description": "Optional expression to define a delay to read latest offsets from Kafka. The expression has to return a timestamp which is used to select ending offsets to read from Kafka.\nDefine a spark sql expression working with the attributes of[[DefaultExpressionData]] returning a timestamp.\nDefault is to read latest offsets existing in Kafka." + } + }, + "title": "KafkaStateIncrementalMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "A special incremental execution mode for Kafka Inputs, remembering the state from the last increment through the Kafka Consumer, e.g. committed offsets." 
+ }, + "PartitionDiffMode": { + "type": "object", + "properties": { + "type": { + "const": "PartitionDiffMode" + }, + "partitionColNb": { + "type": "integer", + "description": "optional number of partition columns to use as a common \\'init\\'." + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing all partitions over multiple actions in case of errors." + }, + "nbOfPartitionValuesPerRun": { + "type": "integer", + "description": "optional restriction of the number of partition values per run." + }, + "applyCondition": { + "type": "string", + "description": "Condition to decide if execution mode should be applied or not. Define a spark sql expression working with attributes of[[DefaultExecutionModeExpressionData]] returning a boolean.\nDefault is to apply the execution mode if given partition values (partition values from command line or passed from previous action) are empty." + }, + "failCondition": { + "type": "string" + }, + "failConditions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "description": "List of conditions to fail application of execution mode if true. Define as spark sql expressions working with attributes of[[PartitionDiffModeExpressionData]] returning a boolean.\nDefault is that the application of the PartitionDiffMode does not fail the action. If there is no data to process, the following actions are skipped.\nMultiple conditions are evaluated individually and every condition may fail the execution mode (or-logic)" + }, + "selectExpression": { + "type": "string", + "description": "optional expression to define or refine the list of selected output partitions. Define a spark sql expression working with the attributes of[[PartitionDiffModeExpressionData]] returning a list>.\nDefault is to return the originally selected output partitions found in attribute selectedOutputPartitionValues." + }, + "applyPartitionValuesTransform": { + "type": "boolean", + "description": "If true applies the partition values transform of custom transformations on input partition values before comparison with output partition values.\nIf enabled input and output partition columns can be different. Default is to disable the transformation of partition values." + }, + "selectAdditionalInputExpression": { + "type": "string", + "description": "optional expression to refine the list of selected input partitions. Note that primarily output partitions are selected by PartitionDiffMode.\nThe selected output partitions are then transformed back to the input partitions needed to create the selected output partitions. This is one-to-one except if applyPartitionValuesTransform=true.\nAnd sometimes there is a need for additional input data to create the output partitions, e.g. 
if you aggregate a window of 7 days for every day.\nYou can customize selected input partitions by defining a spark sql expression working with the attributes of[[PartitionDiffModeExpressionData]] returning a list>.\nDefault is to return the originally selected input partitions found in attribute selectedInputPartitionValues." + } + }, + "title": "PartitionDiffMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Partition difference execution mode lists partitions on mainInput & mainOutput DataObject and starts loading all missing partitions.\nPartition columns to be used for comparision need to be a common \\'init\\' of input and output partition columns.\nThis mode needs mainInput/Output DataObjects which CanHandlePartitions to list partitions.\nPartition values are passed to following actions for partition columns which they have in common." + }, + "ProcessAllMode": { + "type": "object", + "properties": { + "type": { + "const": "ProcessAllMode" + } + }, + "title": "ProcessAllMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "An execution mode which forces processing all data from it\\'s inputs." + }, + "SparkStreamingMode": { + "type": "object", + "properties": { + "type": { + "const": "SparkStreamingMode" + }, + "checkpointLocation": { + "type": "string", + "description": "location for checkpoints of streaming query to keep state" + }, + "triggerType": { + "type": "string", + "description": "define execution interval of Spark streaming query. Possible values are Once (default), ProcessingTime & Continuous. See[[Trigger]] for details.\nNote that this is only applied if SDL is executed in streaming mode. If SDL is executed in normal mode, TriggerType=Once is used always.\nIf triggerType=Once, the action is repeated with Trigger.Once in SDL streaming mode." + }, + "triggerTime": { + "type": "string", + "description": "Time as String in triggerType = ProcessingTime or Continuous. See[[Trigger]] for details." + }, + "inputOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "additional option to apply when reading streaming source. This overwrites options set by the DataObjects." + }, + "outputOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "additional option to apply when writing to streaming sink. This overwrites options set by the DataObjects." + }, + "outputMode": { + "type": "string", + "enum": [ + "Append", + "Complete", + "Update" + ] + } + }, + "title": "SparkStreamingMode", + "required": [ + "type", + "checkpointLocation" + ], + "additionalProperties": false, + "description": "Spark streaming execution mode uses Spark Structured Streaming to incrementally execute data loads and keep track of processed data.\nThis mode needs a DataObject implementing CanCreateStreamingDataFrame and works only with SparkSubFeeds.\nThis mode can be executed synchronously in the DAG by using triggerType=Once, or asynchronously as Streaming Query with triggerType = ProcessingTime or Continuous." + } + }, + "ValidationRule": { + "RowLevelValidationRule": { + "type": "object", + "properties": { + "type": { + "const": "RowLevelValidationRule" + }, + "condition": { + "type": "string", + "description": "an SQL expression defining the condition to be tested. The condition should return true if the condition is satisfied." + }, + "errorMsg": { + "type": "string", + "description": "Optional error msg to be create if the condition fails. 
Default is to use a text representation of the condition." + } + }, + "title": "RowLevelValidationRule", + "required": [ + "type", + "condition" + ], + "additionalProperties": false, + "description": "Definition for a row level data validation rule." + } + }, + "Connection": { + "DeltaLakeTableConnection": { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableConnection" + }, + "catalog": { + "type": "string", + "description": "optional catalog to be used for this connection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for tables directory on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "checkDeltaLakeSparkOptions": { + "type": "boolean" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "DeltaLakeTableConnection", + "required": [ + "type", + "db", + "pathPrefix" + ], + "additionalProperties": false, + "description": "Connection information for DeltaLake tables" + }, + "HadoopFileConnection": { + "type": "object", + "properties": { + "type": { + "const": "HadoopFileConnection" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for accessing files on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HadoopFileConnection", + "required": [ + "type", + "pathPrefix" + ], + "additionalProperties": false, + "description": "Connection information for files on hadoop" + }, + "HiveTableConnection": { + "type": "object", + "properties": { + "type": { + "const": "HiveTableConnection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "optional schema, authority and base path for tables directory on hadoop." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HiveTableConnection", + "required": [ + "type", + "db" + ], + "additionalProperties": false, + "description": "Connection information for hive tables" + }, + "JdbcTableConnection": { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableConnection" + }, + "url": { + "type": "string", + "description": "jdbc connection url" + }, + "driver": { + "type": "string", + "description": "class name of jdbc driver" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": 
"#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "db": { + "type": "string", + "description": "jdbc database" + }, + "maxParallelConnections": { + "type": "integer", + "description": "max number of parallel jdbc connections created by an instance of this connection, default is 3\nNote that Spark manages JDBC Connections on its own. This setting only applies to JDBC connection\nused by SDL for validating metadata or pre/postSQL." + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "connectionPoolMaxWaitTimeSec": { + "type": "integer", + "description": "timeout when waiting for connection in pool to become available. Default is 600 seconds (10 minutes)." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + }, + "autoCommit": { + "type": "boolean", + "description": "flag to enable or disable the auto-commit behaviour. When autoCommit is enabled, each database request is executed in its own transaction.\nDefault is autoCommit = false. It is not recommended to enable autoCommit as it will deactivate any transactional behaviour.", + "deprecated": true + }, + "connectionInitSql": { + "type": "string", + "description": "SQL statement to be executed every time a new connection is created, for example to set session parameters" + } + }, + "title": "JdbcTableConnection", + "required": [ + "type", + "url", + "driver" + ], + "additionalProperties": false, + "description": "Connection information for jdbc tables.\nIf authentication is needed, user and password must be provided." + }, + "KafkaConnection": { + "type": "object", + "properties": { + "type": { + "const": "KafkaConnection" + }, + "brokers": { + "type": "string", + "description": "comma separated list of kafka bootstrap server incl. port, e.g. \\\"host1:9092,host2:9092:" + }, + "schemaRegistry": { + "type": "string", + "description": "url of schema registry service, e.g. 
\\\"https://host2\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html)" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "KafkaConnection", + "required": [ + "type", + "brokers" + ], + "additionalProperties": false, + "description": "Connection information for kafka" + }, + "SFtpFileRefConnection": { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefConnection" + }, + "host": { + "type": "string", + "description": "sftp host" + }, + "port": { + "type": "integer", + "description": "port of sftp service, default is 22" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode and PublicKeyAuthMode are supported." + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "proxy host" + }, + "port": { + "type": "integer", + "description": "proxy port" + }, + "proxyType": { + "type": "string", + "description": "Type of proxy: HTTP or SOCKS. Default is HTTP.", + "enum": [ + "DIRECT", + "HTTP", + "SOCKS" + ] + } + }, + "title": "JavaNetProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false, + "description": "Proxy configuration to create java.net.Proxy instance." 
+ }, + "ignoreHostKeyVerification": { + "type": "boolean", + "description": "do not validate host key if true, default is false" + }, + "maxParallelConnections": { + "type": "integer", + "description": "number of parallel sftp connections created by an instance of this connection" + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SFtpFileRefConnection", + "required": [ + "type", + "host", + "authMode" + ], + "additionalProperties": false, + "description": "SFTP Connection information" + }, + "SnowflakeConnection": { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeConnection" + }, + "url": { + "type": "string", + "description": "snowflake connection url" + }, + "warehouse": { + "type": "string", + "description": "Snowflake namespace" + }, + "database": { + "type": "string", + "description": "Snowflake database" + }, + "role": { + "type": "string", + "description": "Snowflake role" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SnowflakeConnection", + "required": [ + "type", + "url", + "warehouse", + "database", + "role", + "authMode" + ], + "additionalProperties": false, + "description": "Connection information for Snowflake databases.\nThe connection can be used for SnowflakeTableDataObjects\nIf multiple SnowflakeTableDataObjects share a connection, they share the same Snowpark session" + }, + "SplunkConnection": { + "type": "object", + "properties": { + "type": { + "const": "SplunkConnection" + }, + "host": { + "type": "string", + "description": "" + }, + "port": { + "type": "integer", + "description": "" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SplunkConnection", + "required": [ + "type", + "host", + "port", + "authMode" + ], + "additionalProperties": false, + "description": "Connection information for splunk queries" + } + }, + "HousekeepingMode": { + "PartitionArchiveCompactionMode": { + "type": "object", + "properties": { + "type": { + "const": "PartitionArchiveCompactionMode" + }, + "archivePartitionExpression": { + "type": "string", + "description": "Expression to define the archive partition for a given partition. 
Define a spark\nsql expression working with the attributes of[[PartitionExpressionData]] returning archive\npartition values as Map[String,String]. If return value is the same as input elements, partition is not touched,\notherwise all files of the partition are moved to the returned partition definition.\nBe aware that the value of the partition columns changes for these files/records." + }, + "compactPartitionExpression": { + "type": "string", + "description": "Expression to define partitions which should be compacted. Define a spark\nsql expression working with the attributes of[[PartitionExpressionData]] returning a\nboolean = true when this partition should be compacted.\nOnce a partition is compacted, it is marked as compacted and will not be compacted again.\nIt is therefore ok to return true for all partitions which should be compacted, regardless if they have been compacted already." + }, + "description": { + "type": "string" + } + }, + "title": "PartitionArchiveCompactionMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Archive and compact old partitions:\nArchive partition reduces the number of partitions in the past by moving older partitions into special \\\"archive partitions\\\".\nCompact partition reduces the number of files in a partition by rewriting them with Spark.\nExample: archive and compact a table with partition layout run_id=\n- archive partitions after 1000 partitions into \\\"archive partition\\\" equal to floor(run_id/1000)\n- compact \\\"archive partition\\\" when full\n\n```\nhousekeepingMode = {\ntype = PartitionArchiveCompactionMode\narchivePartitionExpression = \\\"if( elements[\\'run_id\\'] < runId - 1000, map(\\'run_id\\', elements[\\'run_id\\'] div 1000), elements)\\\"\ncompactPartitionExpression = \\\"elements[\\'run_id\\'] % 1000 = 0 and elements[\\'run_id\\'] <= runId - 2000\\\"\n}\n```" + }, + "PartitionRetentionMode": { + "type": "object", + "properties": { + "type": { + "const": "PartitionRetentionMode" + }, + "retentionCondition": { + "type": "string", + "description": "Condition to decide if a partition should be kept. Define a spark sql expression\nworking with the attributes of[[PartitionExpressionData]] returning a boolean with value true if the partition should be kept." + }, + "description": { + "type": "string" + } + }, + "title": "PartitionRetentionMode", + "required": [ + "type", + "retentionCondition" + ], + "additionalProperties": false, + "description": "Keep partitions while retention condition is fulfilled, delete other partitions.\nExample: cleanup partitions with partition layout dt= after 90 days:\n\n```\nhousekeepingMode = {\ntype = PartitionRetentionMode\nretentionCondition = \\\"datediff(now(), to_date(elements[\\'dt\\'], \\'yyyyMMdd\\')) <= 90\\\"\n}\n```" + } + }, + "DataObject": { + "AccessTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "AccessTableDataObject" + }, + "path": { + "type": "string" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "AccessTableDataObject", + "required": [ + "type", + "path", + "table" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type JDBC / Access.\nProvides access to a Access DB to an Action. The functionality is handled seperately from[[JdbcTableDataObject]] \nto avoid problems with net.ucanaccess.jdbc.UcanaccessDriver" + }, + "ActionsExporterDataObject": { + "type": "object", + "properties": { + "type": { + "const": "ActionsExporterDataObject" + }, + "config": { + "type": "string" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "ActionsExporterDataObject", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Exports a util[[DataFrame]]that contains properties and metadata extracted from all[[io.smartdatalake.workflow.action.Action]]s\nthat are registered in the current[[InstanceRegistry]].\n\nAlternatively, it can export the properties and metadata of all[[io.smartdatalake.workflow.action.Action]]s defined in config files. For this, the\nconfiguration \\\"config\\\" has to be set to the location of the config.\n\nExample:\n\n```\ndataObjects = {\n...\nactions-exporter {\ntype = ActionsExporterDataObject\nconfig = path/to/myconfiguration.conf\n}\n...\n}\n```\n\n\nThe config value can point to a configuration file or a directory containing configuration files.\n\nSEE: Refer to[[ConfigLoader.loadConfigFromFilesystem()]] for details about the configuration loading." + }, + "AirbyteDataObject": { + "type": "object", + "properties": { + "type": { + "const": "AirbyteDataObject" + }, + "config": { + "type": "string", + "description": "Configuration for the source", + "existingJavaType": "com.typesafe.config.Config" + }, + "streamName": { + "type": "string", + "description": "The stream name to read. Must match an entry of the catalog of the source." + }, + "cmd": { + "oneOf": [ + { + "$ref": "#/definitions/ParsableScriptDef/CmdScript" + }, + { + "$ref": "#/definitions/ParsableScriptDef/DockerRunScript" + } + ], + "description": "command to launch airbyte connector. Normally this is of type[[DockerRunScript]] ." + }, + "incrementalCursorFields": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Some sources need a specification of the cursor field for incremental mode" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "AirbyteDataObject", + "required": [ + "type", + "config", + "streamName", + "cmd" + ], + "additionalProperties": false, + "description": "Limitations: Connectors have only access to locally mounted directories" + }, + "AvroFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "AvroFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "avroOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and\n[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "schema": { + "type": "string", + "description": "An optional schema for the spark data frame to be validated on read and write. Note: Existing Avro files\ncontain a source schema. Therefore, this schema is ignored when reading from existing Avro files.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, avroSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." 
+ }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "AvroFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[io.smartdatalake.workflow.dataobject.DataObject]]backed by an Avro data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on Avro formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively. The reader and writer implementations are provided by\nthe[[https://github.com/databricks/spark-avro databricks spark-avro]] project.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "CsvFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "CsvFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "csvOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] ." 
+ }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "dateColumnType": { + "type": "string", + "description": "Specifies the string format used for writing date typed data.", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." 
+ } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "CsvFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]]backed by a comma-separated value (CSV) data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on CSV formatted files.\n\nCSV reading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively.\n\nRead Schema specifications:\n\nIf a data object schema is not defined via the`schema`attribute (default) and`inferSchema`option is\ndisabled (default) in`csvOptions`, then all column types are set to String and the first row of the CSV file is read\nto determine the column names and the number of fields.\n\nIf the`header`option is disabled (default) in`csvOptions`, then the header is defined as \\\"_c#\\\" for each column\nwhere \\\"#\\\" is the column index.\nOtherwise the first row of the CSV file is not included in the DataFrame content and its entries\nare used as the column names for the schema.\n\nIf a data object schema is not defined via the`schema`attribute and`inferSchema`is enabled in`csvOptions`, then\nthe`samplingRatio`(default: 1.0) option in`csvOptions` is used to extract a sample from the CSV file in order to\ndetermine the input schema automatically.\n\nNOTE: This data object sets the following default values for`csvOptions`: delimiter = \\\"|\\\", quote = null, header = false, and inferSchema = false.\nAll other`csvOption` default to the values defined by Apache Spark.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "CustomDfDataObject": { + "type": "object", + "properties": { + "type": { + "const": "CustomDfDataObject" + }, + "creator": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfCreator]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for creator is loaded from. The scala code in the file needs to be a function of type[[fnExecType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for creator. The scala code needs to be a function of type[[fnExecType]] ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the creator" + } + }, + "title": "CustomDfCreatorConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame creator as part of[[CustomDfDataObject]]\nDefine a exec function which receives a map of options and returns a DataFrame to be used as input.\nOptionally define a schema function to return a StructType used as schema in init-phase.\nSee also trait[[CustomDfCreator]] .\n\nNote that for now implementing CustomDfCreator.schema method is only possible with className configuration attribute." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "CustomDfDataObject", + "required": [ + "type", + "creator" + ], + "additionalProperties": false, + "description": "Generic[[DataObject]] containing a config object.\nE.g. used to implement a CustomAction that reads a Webservice." + }, + "CustomFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "CustomFileDataObject" + }, + "creator": { + "type": "object", + "properties": { + "className": { + "type": "string" + }, + "scalaFile": { + "type": "string" + }, + "scalaCode": { + "type": "string" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + }, + "title": "CustomFileCreatorConfig", + "additionalProperties": false + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "CustomFileDataObject", + "required": [ + "type", + "creator" + ], + "additionalProperties": false + }, + "DataObjectsExporterDataObject": { + "type": "object", + "properties": { + "type": { + "const": "DataObjectsExporterDataObject" + }, + "config": { + "type": "string" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "DataObjectsExporterDataObject", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Exports a util[[DataFrame]]that contains properties and metadata extracted from all[[DataObject]]s\nthat are registered in the current[[InstanceRegistry]].\n\nAlternatively, it can export the properties and metadata of all[[DataObject]]s defined in config files. For this, the\nconfiguration \\\"config\\\" has to be set to the location of the config.\n\nExample:\n\n```\ndataObjects = {\n...\ndataobject-exporter {\ntype = DataObjectsExporterDataObject\nconfig = path/to/myconfiguration.conf\n}\n...\n}\n```\n\n\nThe config value can point to a configuration file or a directory containing configuration files.\n\nSEE: Refer to[[ConfigLoader.loadConfigFromFilesystem()]] for details about the configuration loading." + }, + "DeltaLakeTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableDataObject" + }, + "path": { + "type": "string", + "description": "hadoop directory for this table. If it doesn\\'t contain scheme and authority, the connections pathPrefix is applied.\nIf pathPrefix is not defined or doesn\\'t define scheme and authority, default schema and authority is applied." 
+ }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "partition columns for this data object" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for Delta Lake tables see:[[https://docs.delta.io/latest/delta-batch.html]]and[[org.apache.spark.sql.delta.DeltaOptions]]" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that this DataObject must have to pass schema validation on reading and writing.\nDefine schema by using a DDL-formatted string, which is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. 
All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." + }, + "saveMode": { + "type": "string", + "description": "[[SDLSaveMode]] to use when writing files, default is \\\"overwrite\\\". Overwrite, Append and Merge are supported for now.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "allowSchemaEvolution": { + "type": "boolean", + "description": "If set to true schema evolution will automatically occur when writing to this DataObject with different schema, otherwise SDL will stop with error." + }, + "retentionPeriod": { + "type": "integer", + "description": "Optional delta lake retention threshold in hours. Files required by the table for reading versions younger than retentionPeriod will be preserved and the rest of them will be deleted." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "optional id of[[io.smartdatalake.workflow.connection.HiveTableConnection]]" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." 
+ }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "DeltaLakeTableDataObject", + "required": [ + "type", + "table" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type DeltaLakeTableDataObject.\nProvides details to access Tables in delta format to an Action.\n\nDelta format maintains a transaction log in a separate _delta_log subfolder.\nThe schema is registered in Metastore by DeltaLakeTableDataObject.\n\nThe following anomalies might occur:\n- table is registered in metastore but path does not exist -> table is dropped from metastore\n- table is registered in metastore but path is empty -> error is thrown. Delete the path to clean up\n- table is registered and path contains parquet files, but _delta_log subfolder is missing -> path is converted to delta format\n- table is not registered but path contains parquet files and _delta_log subfolder -> Table is registered\n- table is not registered but path contains parquet files without _delta_log subfolder -> path is converted to delta format and table is registered\n- table is not registered and path does not exists -> table is created on write\n\n* DeltaLakeTableDataObject implements\n-[[CanMergeDataFrame]]by using DeltaTable.merge API.\n-[[CanEvolveSchema]] by using mergeSchema option.\n- Overwriting partitions is implemented by replaceWhere option in one transaction." + }, + "ExcelFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "ExcelFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "excelOptions": { + "type": "object", + "properties": { + "sheetName": { + "type": "string", + "description": "Optional name of the Excel Sheet to read from/write to." + }, + "numLinesToSkip": { + "type": "integer", + "description": "Optional number of rows in the excel spreadsheet to skip before any data is read.\nThis option must not be set for writing." + }, + "startColumn": { + "type": "string", + "description": "Optional first column in the specified Excel Sheet to read from (as string, e.g B).\nThis option must not be set for writing." + }, + "endColumn": { + "type": "string", + "description": "Optional last column in the specified Excel Sheet to read from (as string, e.g. F)." + }, + "rowLimit": { + "type": "integer", + "description": "Optional limit of the number of rows being returned on read.\nThis is applied after`numLinesToSkip` ." + }, + "useHeader": { + "type": "boolean", + "description": "If`true` , the first row of the excel sheet specifies the column names (default: true)." + }, + "treatEmptyValuesAsNulls": { + "type": "boolean", + "description": "Empty cells are parsed as`null` values (default: true)." + }, + "inferSchema": { + "type": "boolean", + "description": "Infer the schema of the excel sheet automatically (default: true)." 
+ }, + "timestampFormat": { + "type": "string", + "description": "A format string specifying the format to use when writing timestamps (default: dd-MM-yyyy HH:mm:ss)." + }, + "dateFormat": { + "type": "string", + "description": "A format string specifying the format to use when writing dates." + }, + "maxRowsInMemory": { + "type": "integer", + "description": "The number of rows that are stored in memory.\nIf set, a streaming reader is used which can help with big files." + }, + "excerptSize": { + "type": "integer", + "description": "Sample size for schema inference." + } + }, + "title": "ExcelOptions", + "additionalProperties": false, + "description": "Options passed to[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] for\nreading and writing Microsoft Excel files. Excel support is provided by the spark-excel project (see link below).\n\nSEE: [[https://github.com/crealytics/spark-excel]]" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." 
+ }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. 
it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "ExcelFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]]backed by an Microsoft Excel data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on Microsoft Excel (.xslx) formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively. The reader and writer implementation is provided by the\n[[https://github.com/crealytics/spark-excel Crealytics spark-excel]]project.\n\nRead Schema:\n\nWhen`useHeader`is set to true (default), the reader will use the first row of the Excel sheet as column names for\nthe schema and not include the first row as data values. Otherwise the column names are taken from the schema.\nIf the schema is not provided or inferred, then each column name is defined as \\\"_c#\\\" where \\\"#\\\" is the column index.\n\nWhen a data object schema is provided, it is used as the schema for the DataFrame. Otherwise if`inferSchema`is\nenabled (default), then the data types of the columns are inferred based on the first`excerptSize`rows\n(excluding the first).\nWhen no schema is provided and`inferSchema` is disabled, all columns are assumed to be of string type." + }, + "HiveTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "HiveTableDataObject" + }, + "path": { + "type": "string", + "description": "hadoop directory for this table. If it doesn\\'t contain scheme and authority, the connections pathPrefix is applied.\nIf pathPrefix is not defined or doesn\\'t define scheme and authority, default schema and authority is applied.\nIf DataObject is only used for reading or if the HiveTable already exist, the path can be omitted.\nIf the HiveTable already exists but with a different path, a warning is issued" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "partition columns for this data object" + }, + "analyzeTableAfterWrite": { + "type": "boolean", + "description": "enable compute statistics after writing data (default=false)" + }, + "dateColumnType": { + "type": "string", + "description": "type of date column", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that this DataObject must have to pass schema validation on reading and writing.\nDefine schema by using a DDL-formatted string, which is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." 
+ }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." 
+ }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." 
+ }, + "numInitialHdfsPartitions": { + "type": "integer", + "description": "number of files created when writing into an empty table (otherwise the number will be derived from the existing data)" + }, + "saveMode": { + "type": "string", + "description": "spark[[SaveMode]] to use when writing files, default is \\\"overwrite\\\"", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "optional id of[[io.smartdatalake.workflow.connection.HiveTableConnection]]" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "HiveTableDataObject", + "required": [ + "type", + "table" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type Hive.\nProvides details to access Hive tables to an Action" + }, + "JdbcTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableDataObject" + }, + "createSql": { + "type": "string", + "description": "DDL-statement to be executed in prepare phase, using output jdbc connection.\nNote that it is also possible to let Spark create the table in Init-phase. See jdbcOptions to customize column data types for auto-created DDL-statement." + }, + "preReadSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase before reading input table, using input jdbc connection.\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." 
+ }, + "postReadSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase after reading input table and before action is finished, using input jdbc connection\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." + }, + "preWriteSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase before writing output table, using output jdbc connection\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." + }, + "postWriteSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase after writing output table, using output jdbc connection\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that this DataObject must have to pass schema validation on reading and writing.\nDefine schema by using a DDL-formatted string, which is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." 
+ }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." + }, + "jdbcFetchSize": { + "type": "integer", + "description": "Number of rows to be fetched together by the Jdbc driver" + }, + "saveMode": { + "type": "string", + "description": "[[SDLSaveMode]] to use when writing table, default is \\\"Overwrite\\\". Only \\\"Append\\\" and \\\"Overwrite\\\" supported.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "allowSchemaEvolution": { + "type": "boolean", + "description": "If set to true schema evolution will automatically occur when writing to this DataObject with different schema, otherwise SDL will stop with error." + }, + "connectionId": { + "type": "string", + "description": "Id of JdbcConnection configuration" + }, + "jdbcOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Any jdbc options according to[[https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html]] .\nNote that some options above set and override some of this options explicitly.\nUse \\\"createTableOptions\\\" and \\\"createTableColumnTypes\\\" to control automatic creating of database tables." + }, + "virtualPartitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Virtual partition columns. Note that this doesn\\'t need to be the same as the database partition\ncolumns for this table. But it is important that there is an index on these columns to efficiently\nlist existing \\\"partitions\\\"." + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "incrementalOutputExpr": { + "type": "string", + "description": "Optional expression to use for creating incremental output with DataObjectStateIncrementalMode.\nThe expression is used to get the high-water-mark for the incremental update state.\nNormally this can be just a column name, e.g. an id or updated timestamp which is continually increasing." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "JdbcTableDataObject", + "required": [ + "type", + "table", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type JDBC.\nProvides details for an action to read and write tables in a database through JDBC.\n\nNote that Sparks distributed processing can not directly write to a JDBC table in one transaction.\nJdbcTableDataObject implements this in one transaction by writing to a temporary-table with Spark,\nthen using a separate \\\"insert into ... select\\\" SQL statement to copy data into the final table.\n\nJdbcTableDataObject implements\n-[[CanMergeDataFrame]]by writing a temp table and using one SQL merge statement.\n-[[CanEvolveSchema]] by generating corresponding alter table DDL statements.\n- Overwriting partitions is implemented by using SQL delete and insert statement embedded in one transaction." + }, + "JmsDataObject": { + "type": "object", + "properties": { + "type": { + "const": "JmsDataObject" + }, + "jndiContextFactory": { + "type": "string", + "description": "JNDI Context Factory" + }, + "jndiProviderUrl": { + "type": "string", + "description": "JNDI Provider URL" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode is supported." 
+ }, + "batchSize": { + "type": "integer", + "description": "JMS batch size" + }, + "maxWaitSec": { + "type": "integer" + }, + "maxBatchAgeSec": { + "type": "integer" + }, + "txBatchSize": { + "type": "integer" + }, + "connectionFactory": { + "type": "string", + "description": "JMS Connection Factory" + }, + "queue": { + "type": "string", + "description": "Name of MQ Queue" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "JmsDataObject", + "required": [ + "type", + "jndiContextFactory", + "jndiProviderUrl", + "authMode", + "batchSize", + "maxWaitSec", + "maxBatchAgeSec", + "txBatchSize", + "connectionFactory", + "queue" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type JMS queue.\nProvides details to an Action to access JMS queues." + }, + "JsonFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "JsonFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "jsonOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and\n[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." 
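JmsDataObject declares many required fields (jndiContextFactory, jndiProviderUrl, authMode, batchSize, maxWaitSec, maxBatchAgeSec, txBatchSize, connectionFactory, queue), so a config entry has to set all of them. A hedged HOCON sketch, all values are placeholders and the BasicAuthMode sub-fields are defined elsewhere in the schema and omitted here:

```
dataObjects {
  my-jms-input {
    type = JmsDataObject
    jndiContextFactory = "com.example.jndi.InitialContextFactory"   # placeholder JNDI context factory
    jndiProviderUrl = "ldap://jndi.example.com"                     # placeholder provider URL
    connectionFactory = "MyConnectionFactory"
    queue = "MY.QUEUE"
    batchSize = 100
    maxWaitSec = 60
    maxBatchAgeSec = 600
    txBatchSize = 10
    authMode {
      type = BasicAuthMode   # per the description, BasicAuthMode is supported for now
      # user/password settings follow the BasicAuthMode definition (not shown in this hunk)
    }
  }
}
```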
+ }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "stringify": { + "type": "boolean", + "description": "Set the data type for all values to string." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "JsonFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[io.smartdatalake.workflow.dataobject.DataObject]]backed by a JSON data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on JSON formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]] respectively.\n\nNOTE: By default, the JSON option`multiline` is enabled.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "KafkaTopicDataObject": { + "type": "object", + "properties": { + "type": { + "const": "KafkaTopicDataObject" + }, + "topicName": { + "type": "string", + "description": "The name of the topic to read" + }, + "connectionId": { + "type": "string" + }, + "keyType": { + "type": "string", + "description": "Optional type the key column should be converted to. If none is given it will be interpreted as string.", + "enum": [ + "String ", + "Binary ", + "Json ", + "Avro ", + "JsonSchemaRegistry ", + "AvroSchemaRegistry " + ] + }, + "keySchema": { + "type": "string", + "description": "An optional schema for parsing the key column. 
This can be used if keyType = Json or Avro to parse the corresponding content.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, avroSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "valueType": { + "type": "string", + "description": "Optional type the value column should be converted to. If none is given it will be interpreted as string.", + "enum": [ + "String ", + "Binary ", + "Json ", + "Avro ", + "JsonSchemaRegistry ", + "AvroSchemaRegistry " + ] + }, + "valueSchema": { + "type": "string", + "description": "An optional schema for parsing the value column. This has to be specified if valueType = Json or Avro to parse the corresponding content.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, avroSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "allowSchemaEvolution": { + "type": "boolean", + "description": "If set to true schema evolution within schema registry will automatically occur when writing to this DataObject with different key or value schema, otherwise SDL will stop with error.\nThis only applies if keyType or valueType is set to Json/AvroSchemaRegistry.\nKafka Schema Evolution implementation will update schema if existing records with old schema can be read with new schema (backward compatible). Otherwise an IncompatibleSchemaException is thrown." + }, + "selectCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Columns to be selected when reading the DataFrame. Available columns are key, value, topic,\npartition, offset, timestamp, timestampType. If key/valueType is AvroSchemaRegistry the key/value column are\nconvert to a complex type according to the avro schema. To expand it select \\\"value.*\\\".\nDefault is to select key and value." + }, + "datePartitionCol": { + "type": "object", + "properties": { + "colName": { + "type": "string", + "description": "date partition column name to extract time into column on batch read" + }, + "timeFormat": { + "type": "string", + "description": "time format for timestamp in date partition column, definition according to java DateTimeFormatter. Default is \\\"yyyyMMdd\\\"." + }, + "timeUnit": { + "type": "string", + "description": "time unit for timestamp in date partition column, definition according to java ChronoUnit. Default is \\\"days\\\"." + }, + "timeZone": { + "type": "string", + "description": "time zone used for date logic. If not specified, java system default is used." + }, + "includeCurrentPartition": { + "type": "boolean", + "description": "If the current partition should be included. Default is to list only completed partitions.\nAttention: including the current partition might result in data loss if there is more data arriving.\nBut it might be useful to export all data before a scheduled maintenance." + } + }, + "title": "DatePartitionColumnDef", + "required": [ + "colName" + ], + "additionalProperties": false, + "description": "Definition of date partition column to extract formatted time into column." 
+ }, + "batchReadConsecutivePartitionsAsRanges": { + "type": "boolean", + "description": "Set to true if consecutive partitions should be combined as one range of offsets when batch reading from topic. This results in less tasks but can be a performance problem when reading many partitions. (default=false)" + }, + "batchReadMaxOffsetsPerTask": { + "type": "integer", + "description": "Set number of offsets per Spark task when batch reading from topic." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html).\nThese options override connection.options." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "KafkaTopicDataObject", + "required": [ + "type", + "topicName", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type KafkaTopic.\nProvides details to an action to read from Kafka Topics using either\n[[org.apache.spark.sql.DataFrameReader]]or[[org.apache.spark.sql.streaming.DataStreamReader]] \n\nKey & value schema can be automatically read from and written to confluent schema registry for Json and Avro.\nJson and Avro can also be parsed with a fixed schema.\n\nCan interpret record timestamp as SDLB partition values by setting datePartitionCol attribute. This allows to use this DataObject as input for PartitionDiffMode.\nThe DataObject does not support writing with SDLB partition values, as timestamp is autogenerated by Kafka using current time.\n\nSupport incremental output and use with DataObjectStateIncrementalMode." + }, + "PKViolatorsDataObject": { + "type": "object", + "properties": { + "type": { + "const": "PKViolatorsDataObject" + }, + "config": { + "type": "string", + "description": ": The config value can point to a configuration file or a directory containing configuration files." + }, + "flattenOutput": { + "type": "boolean", + "description": ": if true, key and data column are converted from type map to string (default)." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "PKViolatorsDataObject", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Checks for Primary Key violations for all[[DataObject]]s with Primary Keys defined that are registered in the current[[InstanceRegistry]].\nReturns the DataFrame of Primary Key violations.\n\nAlternatively, it can check for Primary Key violations of all[[DataObject]]s defined in config files. For this, the\nconfiguration \\\"config\\\" has to be set to the location of the config.\n\nExample:\n\n```\ndataObjects = {\n...\nprimarykey-violations {\ntype = PKViolatorsDataObject\nconfig = path/to/myconfiguration.conf\n}\n...\n}\n```\n\nSEE: Refer to[[ConfigLoader.loadConfigFromFilesystem()]] for details about the configuration loading." + }, + "ParquetFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "ParquetFileDataObject" + }, + "path": { + "type": "string", + "description": "Hadoop directory where this data object reads/writes it\\'s files.\nIf it doesn\\'t contain scheme and authority, the connections pathPrefix is applied. 
If pathPrefix is not\ndefined or doesn\\'t define scheme and authority, default schema and authority is applied.\nOptionally defined partitions are appended with hadoop standard partition layout to this path.\nOnly files ending with *.parquet* are considered as data for this DataObject." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "partition columns for this data object" + }, + "parquetOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and\n[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "schema": { + "type": "string", + "description": "An optional schema for the spark data frame to be validated on read and write. Note: Existing Parquet files\ncontain a source schema. Therefore, this schema is ignored when reading from existing Parquet files.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "saveMode": { + "type": "string", + "description": "spark[[SaveMode]] to use when writing files, default is \\\"overwrite\\\"", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). 
If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "optional id of[[io.smartdatalake.workflow.connection.HadoopFileConnection]]" + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "ParquetFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[io.smartdatalake.workflow.dataobject.DataObject]]backed by an Apache Hive data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on Parquet formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]] respectively.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "RawFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "RawFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "customFormat": { + "type": "string", + "description": "Custom Spark data source format, e.g. binaryFile or text. Only needed if you want to read/write this DataObject with Spark." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for custom Spark data source format. Only of use if you want to read/write this DataObject with Spark." + }, + "fileName": { + "type": "string", + "description": "Definition of fileName. This is concatenated with path and partition layout to search for files. Default is an asterix to match everything." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional[[DataObject]]user-defined schema definition.\n\nSome[[DataObject]]s support optional schema inference.\nSpecifying this attribute disables automatic schema inference. When the wrapped data source contains a source\nschema, this`schema`attribute is ignored.\n\nNote: This is only used by the functionality defined in[[CanCreateDataFrame]], that is,\nwhen reading Spark data frames from the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that bypass Spark data frames ignore the`schema` attribute\nif it is defined." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." 
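The ParquetFileDataObject completed above only requires type and path; the sparkRepartition block below illustrates the task-count rule from SparkRepartitionDef (numberOfTasksPerPartition Spark tasks per written Hadoop partition, records distributed by hashvalue(keyCols) modulo numberOfTasksPerPartition). A hedged sketch with placeholder ids and paths:

```
dataObjects {
  my-parquet-output {
    type = ParquetFileDataObject
    path = "data/my-parquet-output"      # placeholder; the connection pathPrefix may be prepended
    partitions = [year, month]
    saveMode = Overwrite
    sparkRepartition {
      numberOfTasksPerPartition = 4      # 4 Spark tasks (and files) per written Hadoop partition
      keyCols = [id]                     # required here because the DataObject defines partitions
      sortCols = [id]                    # sort records inside the created files
    }
    connectionId = myHadoopConnection    # optional HadoopFileConnection id (placeholder)
  }
}
```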
+ }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "RawFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "DataObject of type raw for files with unknown content.\nProvides details to an Action to access raw files.\nBy specifying format you can custom Spark data formats" + }, + "RelaxedCsvFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "RelaxedCsvFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "csvOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "The data object schema.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." 
+ }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "dateColumnType": { + "type": "string", + "description": "Specifies the string format used for writing date typed data.", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "treatMissingColumnsAsCorrupt": { + "type": "boolean", + "description": "If set to true records from files with missing columns in its header are treated as corrupt (default=false).\nCorrupt records are handled according to options.mode (default=permissive)." + }, + "treatSuperfluousColumnsAsCorrupt": { + "type": "boolean", + "description": "If set to true records from files with superfluous columns in its header are treated as corrupt (default=false).\nCorrupt records are handled according to options.mode (default=permissive)." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. 
To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "RelaxedCsvFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]] which allows for more flexible CSV parsing.\nThe standard CsvFileDataObject doesnt support reading multiple CSV-Files with different column order, missing columns\nor additional columns.\nRelaxCsvFileDataObject works more like reading JSON-Files. You need to define a schema, then it tries to read every file\nwith that schema independently of the column order, adding missing columns and removing superfluous ones.\n\nCSV files are read by Spark as whole text files and then parsed manually with Sparks CSV parser class. 
You can therefore use the\nnormal CSV options of spark, but some properties are fixed, e.g. header=true, inferSchema=false, enforceSchema (ignored).\n\nNOTE: This data object sets the following default values for`csvOptions`: delimiter = \\\",\\\", quote = null\nAll other`csvOption` default to the values defined by Apache Spark.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]] \n\nIf mode is permissive you can retrieve the corrupt input record by adding as field to the schema.\nRelaxCsvFileDataObject also supports getting an error msg by adding \\\"_msg\\\" as field to the schema." + }, + "SFtpFileRefDataObject": { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "connectionId": { + "type": "string" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "partitionLayout": { + "type": "string", + "description": "partition layout defines how partition values can be extracted from the path.\nUse \\\"%%\\\" as token to extract the value for a partition column.\nAs partition layout extracts partition from the path of individual files, it can also be used to extract partitions from the file name.\nWith \\\"%%\\\" a regex can be given to limit search. This is especially useful\nif there is no char to delimit the last token from the rest of the path or also between\ntwo tokens.\nBe careful that for directory based partition values extraction, the final path separator must be part\nof the partition layout to extract the last token correctly, e.g. \\\"%year%/\\\" for partitioning with yearly directories." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "SFtpFileRefDataObject", + "required": [ + "type", + "path", + "connectionId" + ], + "additionalProperties": false, + "description": "Connects to SFtp files\nNeeds java library \\\"com.hieronymus % sshj % 0.21.1\\\"\nThe following authentication mechanisms are supported\n-> public/private-key: private key must be saved in ~/.ssh, public key must be registered on server.\n-> user/pwd authentication: user and password is taken from two variables set as parameters.\nThese variables could come from clear text (CLEAR), a file (FILE) or an environment variable (ENV)" + }, + "SnowflakeTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeTableDataObject" + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. 
All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." + }, + "saveMode": { + "type": "string", + "description": "spark[[SDLSaveMode]] to use when writing files, default is \\\"overwrite\\\"", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "connectionId": { + "type": "string", + "description": "The SnowflakeTableConnection to use for the table" + }, + "comment": { + "type": "string", + "description": "An optional comment to add to the table after writing a DataFrame to it" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "SnowflakeTableDataObject", + "required": [ + "type", + "table", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type SnowflakeTableDataObject.\nProvides details to access Snowflake tables via an action\nCan be used both for interacting with Snowflake through Spark with JDBC,\nas well as for actions written in the Snowpark API that run directly on Snowflake" + }, + "SplunkDataObject": { + "type": "object", + "properties": { + "type": { + "const": "SplunkDataObject" + }, + "params": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "queryFrom": { + "type": "string", + "existingJavaType": "java.time.LocalDateTime" + }, + "queryTo": { + "type": "string", + "existingJavaType": "java.time.LocalDateTime" + }, + "queryTimeInterval": { + "type": "string", + "existingJavaType": "java.time.Duration" + }, + "columnNames": { + "type": "array", + "items": { + "type": "string" + } + }, + "parallelRequests": { + "type": "integer" + } + }, + "title": "SplunkParams", + "required": [ + "query", + "queryFrom", + "queryTo" + ], + "additionalProperties": false + }, + "connectionId": { + "type": "string" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "SplunkDataObject", + "required": [ + "type", + "params", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type Splunk.\nProvides details to an action to access Splunk logs." 
+ }, + "TickTockHiveTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "TickTockHiveTableDataObject" + }, + "path": { + "type": "string" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "analyzeTableAfterWrite": { + "type": "boolean" + }, + "dateColumnType": { + "type": "string", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of constraint definitions to validate on write, see[[Constraint]] for details.\nConstraints are expressions defined on row-level and validated during evaluation of the DataFrame.\nIf validation fails an exception is thrown and further processing is stopped.\nNote that this is done while evaluating the DataFrame when writing to the DataObject. It doesn\\'t need a separate action on the DataFrame.\nIf a constraint validation for a row fails, it will throw an exception and abort writing to the DataObject." 
+ }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "Map of expectation name and definition to evaluate on write, see[[Expectation]] for details.\nExpectations are aggregation expressions defined on dataset-level and evaluated on every write.\nBy default their result is logged with level info (ok) and error (failed), but this can be customized to be logged as warning.\nIn case of failed expectations logged as error, an exceptions is thrown and further processing is stopped.\nNote that the exception is thrown after writing to the DataObject is finished.\n\nThe following expectations names are reserved to create default metrics and should not be used:\n- count" + }, + "numInitialHdfsPartitions": { + "type": "integer" + }, + "saveMode": { + "type": "string", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Definition of partitions that are expected to exists.\nThis is used to validate that partitions being read exists and don\\'t return no data.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nexample: \\\"elements[\\'yourColName\\'] > 2017\\\"\n\nOTHERTAG: true if partition is expected to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Configure a housekeeping mode to e.g cleanup, archive and compact partitions.\nDefault is None." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "TickTockHiveTableDataObject", + "required": [ + "type", + "table" + ], + "additionalProperties": false + }, + "WebserviceFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "WebserviceFileDataObject" + }, + "url": { + "type": "string" + }, + "additionalHeaders": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "timeouts": { + "type": "object", + "properties": { + "connectionTimeoutMs": { + "type": "integer" + }, + "readTimeoutMs": { + "type": "integer" + } + }, + "title": "HttpTimeoutConfig", + "required": [ + "connectionTimeoutMs", + "readTimeoutMs" + ], + "additionalProperties": false + }, + "readTimeoutMs": { + "type": "integer" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "mimeType": { + "type": "string" + }, + "writeMethod": { + "type": "string", + "enum": [ + "Delete ", + "Put ", + "Post ", + "Get " + ] + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string" + }, + "port": { + "type": "integer" + } + }, + "title": "HttpProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false + }, + "followRedirects": { + "type": "boolean" + }, + "partitionDefs": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "values": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "title": "WebservicePartitionDefinition", + "required": [ + "name", + "values" + ], + "additionalProperties": false + }, + "description": "list of partitions with list of possible values for every entry" + }, + "partitionLayout": { + "type": "string", + "description": "definition of partitions in query string. Use %% as placeholder for partition column value in layout." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "WebserviceFileDataObject", + "required": [ + "type", + "url" + ], + "additionalProperties": false, + "description": "[[DataObject]] to call webservice and return response as InputStream\nThis is implemented as FileRefDataObject because the response is treated as some file content.\nFileRefDataObjects support partitioned data. For a WebserviceFileDataObject partitions are mapped as query parameters to create query string.\nAll possible query parameter values must be given in configuration." + }, + "XmlFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "XmlFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "rowTag": { + "type": "string" + }, + "xmlOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] ." 
+ }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. 
To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "flatten": { + "type": "boolean" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "XmlFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]]backed by an XML data source.\n\nIt manages read and write access and configurations required for[[Action]]s to\nwork on XML formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively. 
The reader and writer implementations are provided by\nthe[[https://github.com/databricks/spark-xml databricks spark-xml]] project.\nNote that writing XML-file partitioned is not supported by spark-xml.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + } + }, + "SaveModeOptions": { + "SaveModeGenericOptions": { + "type": "object", + "properties": { + "type": { + "const": "SaveModeGenericOptions" + }, + "saveMode": { + "type": "string", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + } + }, + "title": "SaveModeGenericOptions", + "required": [ + "type", + "saveMode" + ], + "additionalProperties": false, + "description": "This class can be used to override save mode without further special parameters." + }, + "SaveModeMergeOptions": { + "type": "object", + "properties": { + "type": { + "const": "SaveModeMergeOptions" + }, + "deleteCondition": { + "type": "string", + "description": "A condition to control if matched records are deleted. If no condition is given, *no* records are delete." + }, + "updateCondition": { + "type": "string", + "description": "A condition to control if matched records are updated. If no condition is given all matched records are updated (default).\nNote that delete is applied before update. Records selected for deletion are automatically excluded from the updates." + }, + "updateColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of column names to update in update clause. If empty all columns (except primary keys) are updated (default)" + }, + "insertCondition": { + "type": "string", + "description": "A condition to control if unmatched records are inserted. If no condition is given all unmatched records are inserted (default)." + }, + "insertColumnsToIgnore": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of column names to ignore in insert clause. If empty all columns are inserted (default)." + }, + "insertValuesOverride": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional Map of column name and value expression to override value on insert. Value expressions have to be a sql expression string, e.g. true or \\'abc\\'." + }, + "additionalMergePredicate": { + "type": "string", + "description": "To optimize performance for SDLSaveMode.Merge it might be interesting to limit the records read from the existing table data, e.g. merge operation might use only the last 7 days." + } + }, + "title": "SaveModeMergeOptions", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Options to control detailed behaviour of SaveMode.Merge.\nIn Spark expressions use table alias \\'existing\\' to reference columns of the existing table data, and table alias \\'new\\' to reference columns of new data set." 
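SaveModeMergeOptions above is normally combined with saveMode = Merge on a transactional data object. The sketch below assumes the attribute carrying it is called saveModeOptions (that attribute itself is not part of this excerpt); the conditions are invented and use the 'existing'/'new' aliases described above:

```hocon
saveMode = Merge
saveModeOptions {
  type = SaveModeMergeOptions
  deleteCondition = "new.deleted = true"                             # 'new' refers to the incoming data set
  updateCondition = "existing.hash != new.hash"                      # 'existing' refers to the current table data
  additionalMergePredicate = "existing.load_date >= new.load_date"   # limit the existing records scanned (invented predicate)
}
```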
+ } + }, + "ParsableScriptDef": { + "CmdScript": { + "type": "object", + "properties": { + "type": { + "const": "CmdScript" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "winCmd": { + "type": "string", + "description": "Cmd to execute on windows operating systems - note that it is executed with \\\"cmd /C\\\" prefixed" + }, + "linuxCmd": { + "type": "string", + "description": "Cmd to execute on linux operating systems - note that it is executed with \\\"sh -c\\\" prefixed." + } + }, + "title": "CmdScript", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Execute a command on the command line and get its std output\nCommand can be different for windows and linux operating systems, but it must be defined for at least one of them.\n\nIf return value is not zero an exception is thrown.\n\nNote about internal implementation: on execution value of parameter map entries where key starts with\n- \\'param\\' will be added as parameter after the docker run command, sorted by key.\nThis allows to customize execution behaviour through Actions or DataObjects using CmdScript." + }, + "DockerRunScript": { + "type": "object", + "properties": { + "type": { + "const": "DockerRunScript" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "image": { + "type": "string", + "description": "Docker image to run" + }, + "winDockerCmd": { + "type": "string", + "description": "Cmd to execute docker on windows operating systems. Default is \\'docker\\'." + }, + "linuxDockerCmd": { + "type": "string", + "description": "Cmd to execute docker on linux operating systems. Default is \\'docker\\'." + }, + "localDataDirToMount": { + "type": "string", + "description": "Optional directory that will be mounted as /mnt/data in the container. This is needed if your container wants to access files available in your local filesystem." + } + }, + "title": "DockerRunScript", + "required": [ + "type", + "image" + ], + "additionalProperties": false, + "description": "Run a docker image and get its std output.\n\nIf return value is not zero an exception is thrown.\n\nNote about internal implementation: on execution value of parameter map entries where key starts with\n- \\'runParam\\' will be added as parameter after the docker run command, sorted by their key.\n- \\'dockerParam\\' will be added as parameter for the docker command, e.g. before the image name in the docker run command, sorted by their key.\nThis allows to customize execution behaviour through Actions or DataObjects using CmdScript." 
+ } + }, + "Others": { + "ActionMetadata": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Readable name of the Action" + }, + "description": { + "type": "string", + "description": "Description of the content of the Action" + }, + "feed": { + "type": "string", + "description": "Name of the feed this Action belongs to" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional custom tags for this object" + } + }, + "title": "ActionMetadata", + "additionalProperties": false, + "description": "Additional metadata for an Action" + }, + "ConnectionMetadata": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Readable name of the Connection" + }, + "description": { + "type": "string", + "description": "Description of the content of the Connection" + }, + "layer": { + "type": "string", + "description": "Name of the layer this Connection belongs to" + }, + "subjectArea": { + "type": "string", + "description": "Name of the subject area this Connection belongs to" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional custom tags for this object" + } + }, + "title": "ConnectionMetadata", + "additionalProperties": false, + "description": "Additional metadata for a Connection" + }, + "DataObjectMetadata": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Readable name of the DataObject" + }, + "description": { + "type": "string", + "description": "Description of the content of the DataObject" + }, + "layer": { + "type": "string", + "description": "Name of the layer this DataObject belongs to" + }, + "subjectArea": { + "type": "string", + "description": "Name of the subject area this DataObject belongs to" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional custom tags for this object" + } + }, + "title": "DataObjectMetadata", + "additionalProperties": false, + "description": "Additional metadata for a DataObject" + }, + "Table": { + "type": "object", + "properties": { + "db": { + "type": "string", + "description": "database-schema to be used for this table.\n If there exists a connection for the DataObject and this field is not defined, it will be set to the connections database value .\nCalled db for backwards-compatibility because for hive tables, db and schema mean the same thing." + }, + "name": { + "type": "string", + "description": "table name" + }, + "query": { + "type": "string", + "description": "optional select query" + }, + "primaryKey": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional sequence of primary key columns" + }, + "foreignKeys": { + "type": "array", + "items": { + "type": "object", + "properties": { + "db": { + "type": "string", + "description": "target database, if not defined it is assumed to be the same as the table owning the foreign key" + }, + "table": { + "type": "string", + "description": "referenced target table name" + }, + "columns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "mapping of source column(s) to referenced target table column(s). 
The map is given\nas a list of objects with the following syntax: {\\\"local_column_name\\\" : \\\"external_column_name\\\"}" + }, + "name": { + "type": "string", + "description": "optional name for foreign key, e.g to depict it\\'s role.\n\n\nForeign keys in .conf files are to be defined like the following example \n(here two foreign key objects): \nforeignKeys = [\n{\ndb = \\\"OPTIONAL_DB_name\\\"\ntable = \\\"table_id\\\"\ncolumns = {\n\\\"local_column_name\\\": \\\"external_column_name\\\"\n}\nname = \\\"OPTIONAL_key_name\\\"\n},\n{\ntable = \\\"another_table_id\\\"\ncolumns = {\n\\\"another_local_column_name\\\": \\\"another_external_column_name\\\"\n}\nname = \\\"another_OPTIONAL_key_name\\\"\n}\n]" + } + }, + "title": "ForeignKey", + "required": [ + "table", + "columns" + ], + "additionalProperties": false, + "description": "Foreign key definition." + }, + "description": "optional sequence of foreign key definitions.\nThis is used as metadata for a data catalog.\nEach foreign key in the .conf files is an object with the following properties: \n{db: string, table: string , name: string map: Map[String]}, whereas a Map[String] is simply \na further object of the type {:string, :string}. For example: \nforeignKeys = [\n{\ndb = \\\"OPTIONAL_DB_name\\\" \ntable = \\\"table_id\\\" \ncolumns = { \n\\\"local_column_name\\\": \\\"external_column_name\\\" \n} \nname = \\\"OPTIONAL_key_name\\\" \n} \n]" + }, + "catalog": { + "type": "string", + "description": "Optional catalog to be used for this table. If null default catalog is used.\nIf there exists a connection with catalog value for the DataObject and this field is not defined, it will be set to the connections catalog value." + } + }, + "title": "Table", + "required": [ + "name" + ], + "additionalProperties": false, + "description": "Table attributes" + } + }, + "GenericDfTransformer": { + "AdditionalColumnsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "AdditionalColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "additionalColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]]and added to the DataFrame as literal columns.\n[[DefaultExpressionData]] contains informations from the context of the SDLB job, like runId or feed name." + }, + "additionalDerivedColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against the input DataFrame and added to the DataFrame as derived columns." + } + }, + "title": "AdditionalColumnsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Add additional columns to the DataFrame by extracting information from the context or derived from input columns." 
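A hedged sketch of the AdditionalColumnsTransformer above, shown as one entry of an action's transformer chain (the surrounding action and the column expressions are invented):

```hocon
transformers = [{
  type = AdditionalColumnsTransformer
  additionalColumns { run_id = "runId" }                     # literal column taken from the SDLB job context (DefaultExpressionData)
  additionalDerivedColumns { amount_chf = "amount * 0.95" }  # derived from input columns
}]
```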
+ }, + "BlacklistTransformer": { + "type": "object", + "properties": { + "type": { + "const": "BlacklistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnBlacklist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to exclude from DataFrame" + } + }, + "title": "BlacklistTransformer", + "required": [ + "type", + "columnBlacklist" + ], + "additionalProperties": false, + "description": "Apply a column blacklist to a DataFrame." + }, + "DataValidationTransformer": { + "type": "object", + "properties": { + "type": { + "const": "DataValidationTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "rules": { + "type": "array", + "items": { + "$ref": "#/definitions/ValidationRule/RowLevelValidationRule" + }, + "description": "list of validation rules to apply to the DataFrame" + }, + "errorsColumn": { + "type": "string", + "description": "Optional column name for the list of error messages. Default is \\\"errors\\\"." + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "For validating the rule expression, the runtime subFeedType is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "DataValidationTransformer", + "required": [ + "type", + "rules" + ], + "additionalProperties": false, + "description": "Apply validation rules to a DataFrame and collect potential violation error messages in a new column." + }, + "DecryptColumnsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "DecryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "decryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "algorithm": { + "type": "string" + } + }, + "title": "DecryptColumnsTransformer", + "required": [ + "type", + "decryptColumns" + ], + "additionalProperties": false, + "description": "Decryption of specified columns using AES/GCM algorithm." 
+ }, + "EncryptColumnsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "EncryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "encryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "EncryptColumnsTransformer", + "required": [ + "type", + "encryptColumns" + ], + "additionalProperties": false, + "description": "Encryption of specified columns using AES/GCM algorithm." + }, + "FilterTransformer": { + "type": "object", + "properties": { + "type": { + "const": "FilterTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "filterClause": { + "type": "string", + "description": "Spark SQL expression to filter the DataFrame" + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "When parsing the configuration the runtime subFeedType for validating the filter expression is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "FilterTransformer", + "required": [ + "type", + "filterClause" + ], + "additionalProperties": false, + "description": "Apply a filter condition to a DataFrame." + }, + "PythonCodeDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "PythonCodeDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Optional python code to user for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "file": { + "type": "string", + "description": "Optional file with python code to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "PythonCodeDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Python/PySpark code.\nNote that this transformer needs a Python and PySpark environment installed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + "SQLDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "SQLDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"\nThe special token %{inputViewName} or ${inputViewName_} can be used to insert the temporary view name.\nThe input name is either the name of the DataObject, or the name of the previous transformation\nif this is not the first transformation of the chain. Make sure to change the standard name of\nthe previous transformation in that case." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "SQLDfTransformer", + "required": [ + "type", + "code" + ], + "additionalProperties": false, + "description": "Configuration of a custom GenericDataFrame transformation between one input and one output (1:1) as SQL code.\nThe input data is available as temporary view in SQL. The inputs name is either the name of the DataObject,\nor the name of the previous transformation, if this is not the first transformation of the chain. Also note that to create\nthe name of temporary view, special characters are replaced by underscores and a postfix \\\"_sdltemp\\\" is added.\nIt is therefore recommended to use special token %{inputViewName} or ${inputViewName_} that will be\nreplaced with the name of the temporary view at runtime." 
+ }, + "ScalaClassGenericDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassGenericDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomGenericDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassGenericDfTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomGenericDfTransformer]] ." + }, + "ScalaClassSnowparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSnowparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomSnowparkDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSnowparkDfTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Snowpark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomSnowparkDfTransformer]] ." 
+ }, + "ScalaClassSparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDfTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomDfTransformer]] ." + }, + "ScalaClassSparkDsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "transformerClassName": { + "type": "string", + "description": "class name implementing trait[[CustomDsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDsTransformer", + "required": [ + "type", + "transformerClassName" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-Dataset transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a SparkSession, a Dataset and a map of options and has to return a Dataset.\nThe Java/Scala class has to implement interface[[CustomDsTransformer]] ." + }, + "ScalaCodeSparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaCodeSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "file": { + "type": "string", + "description": "File where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." 
+ }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaCodeSparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The scala code has to implement a function of type[[fnTransformType]] ." + }, + "ScalaNotebookSparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaNotebookSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "url": { + "type": "string", + "description": "Url to download notebook in IPYNB-format, which defines transformation." + }, + "functionName": { + "type": "string", + "description": "The notebook needs to contain a Scala-function with this name and type[[fnTransformType]] ." + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information for webservice, e.g. BasicAuthMode for user/pw authentication" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaNotebookSparkDfTransformer", + "required": [ + "type", + "url", + "functionName" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nThe code is loaded from a Notebook. It should define a transform function with a configurable name, which receives a DataObjectId, a DataFrame\nand a map of options and has to return a DataFrame, see also ([[fnTransformType]] ).\nNotebook-cells starting with \\\"//!IGNORE\\\" will be ignored." 
+ }, + "SparkRepartitionTransformer": { + "type": "object", + "properties": { + "type": { + "const": "SparkRepartitionTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition value by repartitioning the DataFrame." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a partition value." + } + }, + "title": "SparkRepartitionTransformer", + "required": [ + "type", + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "Repartition DataFrame\nFor detailled description about repartitioning DataFrames see also[[SparkRepartitionDef]]" + }, + "StandardizeColNamesTransformer": { + "type": "object", + "properties": { + "type": { + "const": "StandardizeColNamesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "camelCaseToLower": { + "type": "boolean", + "description": "If selected, converts Camel case names to lower case with underscores, i.e. TestString -> test_string, testABCtest -> test_ABCtest\nOtherwise converts just to lower case." + }, + "normalizeToAscii": { + "type": "boolean", + "description": "If selected, converts UTF-8 special characters (e.g. diacritics, umlauts) to ASCII chars (best effort), i.e. Öffi_émily -> Oeffi_emily" + }, + "removeNonStandardSQLNameChars": { + "type": "boolean", + "description": "Remove all chars from a string which dont belong to lowercase SQL standard naming characters, i.e abc$!-& -> abc" + } + }, + "title": "StandardizeColNamesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardizes column names to be used without quoting by using camelCase to lower_case_with_underscore rule (default), and further cleanup rules for special characters (default).\nParameters below can be used to disable specific rules if needed." + }, + "StandardizeSparkDatatypesTransformer": { + "type": "object", + "properties": { + "type": { + "const": "StandardizeSparkDatatypesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + } + }, + "title": "StandardizeSparkDatatypesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardize datatypes of a Spark-DataFrame.\nCurrent implementation converts all decimal datatypes to a corresponding integral or float datatype" + }, + "WhitelistTransformer": { + "type": "object", + "properties": { + "type": { + "const": "WhitelistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnWhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to keep from DataFrame" + } + }, + "title": "WhitelistTransformer", + "required": [ + "type", + "columnWhitelist" + ], + "additionalProperties": false, + "description": "Apply a column whitelist to a DataFrame." 
+ } + }, + "AuthMode": { + "AuthHeaderMode": { + "type": "object", + "properties": { + "type": { + "const": "AuthHeaderMode" + }, + "headerName": { + "type": "string" + }, + "secret": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "secretVariable": { + "type": "string", + "deprecated": true + } + }, + "title": "AuthHeaderMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Connect by custom authorization header" + }, + "BasicAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "BasicAuthMode" + }, + "user": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "password": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "userVariable": { + "type": "string", + "deprecated": true + }, + "passwordVariable": { + "type": "string", + "deprecated": true + } + }, + "title": "BasicAuthMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Connect by basic authentication" + }, + "CustomHttpAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "CustomHttpAuthMode" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomHttpAuthModeLogic]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "Options to pass to the custom auth mode logic in prepare function.\n\nThe value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "CustomHttpAuthMode", + "required": [ + "type", + "className", + "options" + ], + "additionalProperties": false, + "description": "Connect with custom HTTP authentication" + }, + "KeycloakClientSecretAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "KeycloakClientSecretAuthMode" + }, + "ssoServer": { + "type": "string" + }, + "ssoRealm": { + "type": "string" + }, + "ssoGrantType": { + "type": "string" + }, + "clientIdVariable": { + "type": "string", + "deprecated": true + }, + "clientId": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "clientSecretVariable": { + "type": "string", + "deprecated": true + }, + "clientSecret": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "KeycloakClientSecretAuthMode", + "required": [ + "type", + "ssoServer", + "ssoRealm", + "ssoGrantType" + ], + "additionalProperties": false, + "description": "Connect by using Keycloak to manage token and token refresh giving clientId/secret as information.\nFor HTTP Connection this is used as Bearer token in Authorization header." + }, + "PublicKeyAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "PublicKeyAuthMode" + }, + "userVariable": { + "type": "string", + "deprecated": true + }, + "user": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "PublicKeyAuthMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Validate by user and private/public key\nPrivate key is read from .ssh" + }, + "SASLSCRAMAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "SASLSCRAMAuthMode" + }, + "username": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "passwordVariable": { + "type": "string", + "deprecated": true + }, + "password": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "sslMechanism": { + "type": "string" + }, + "truststorePath": { + "type": "string" + }, + "truststoreType": { + "type": "string" + }, + "truststorePassVariable": { + "type": "string", + "deprecated": true + }, + "truststorePass": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "SASLSCRAMAuthMode", + "required": [ + "type", + "username", + "sslMechanism", + "truststorePath" + ], + "additionalProperties": false, + "description": "Validate by SASL_SSL Authentication : user / password and truststore" + }, + "SSLCertsAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "SSLCertsAuthMode" + }, + "keystorePath": { + "type": "string" + }, + "keystoreType": { + "type": "string" + }, + "keystorePassVariable": { + "type": "string", + "deprecated": true + }, + "keystorePass": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "truststorePath": { + "type": "string" + }, + "truststoreType": { + "type": "string" + }, + "truststorePassVariable": { + "type": "string", + "deprecated": true + }, + "truststorePass": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "SSLCertsAuthMode", + "required": [ + "type", + "keystorePath", + "truststorePath" + ], + "additionalProperties": false, + "description": "Validate by SSL Certificates : Only location an credentials. Additional attributes should be\nsupplied via options map" + }, + "TokenAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "TokenAuthMode" + }, + "tokenVariable": { + "type": "string", + "deprecated": true + }, + "token": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "TokenAuthMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Connect by token\nFor HTTP Connection this is used as Bearer token in Authorization header." + } + }, + "GenericDfsTransformer": { + "DfTransformerWrapperDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "DfTransformerWrapperDfsTransformer" + }, + "transformer": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "BlacklistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnBlacklist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to exclude from DataFrame" + } + }, + "title": "BlacklistTransformer", + "required": [ + "columnBlacklist", + "type" + ], + "additionalProperties": false, + "description": "Apply a column blacklist to a DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "StandardizeColNamesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "camelCaseToLower": { + "type": "boolean", + "description": "If selected, converts Camel case names to lower case with underscores, i.e. TestString -> test_string, testABCtest -> test_ABCtest\nOtherwise converts just to lower case." + }, + "normalizeToAscii": { + "type": "boolean", + "description": "If selected, converts UTF-8 special characters (e.g. diacritics, umlauts) to ASCII chars (best effort), i.e. Öffi_émily -> Oeffi_emily" + }, + "removeNonStandardSQLNameChars": { + "type": "boolean", + "description": "Remove all chars from a string which dont belong to lowercase SQL standard naming characters, i.e abc$!-& -> abc" + } + }, + "title": "StandardizeColNamesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardizes column names to be used without quoting by using camelCase to lower_case_with_underscore rule (default), and further cleanup rules for special characters (default).\nParameters below can be used to disable specific rules if needed." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "DataValidationTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "rules": { + "type": "array", + "items": { + "$ref": "#/definitions/ValidationRule/RowLevelValidationRule" + }, + "description": "list of validation rules to apply to the DataFrame" + }, + "errorsColumn": { + "type": "string", + "description": "Optional column name for the list of error messages. Default is \\\"errors\\\"." + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "For validating the rule expression, the runtime subFeedType is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "DataValidationTransformer", + "required": [ + "rules", + "type" + ], + "additionalProperties": false, + "description": "Apply validation rules to a DataFrame and collect potential violation error messages in a new column." + }, + { + "type": "object", + "properties": { + "type": { + "const": "DecryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "decryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "algorithm": { + "type": "string" + } + }, + "title": "DecryptColumnsTransformer", + "required": [ + "decryptColumns", + "type" + ], + "additionalProperties": false, + "description": "Decryption of specified columns using AES/GCM algorithm." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SparkDfTransformer" + } + }, + "title": "SparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Interface to implement Spark-DataFrame transformers working with one input and one output (1:1)" + }, + { + "type": "object", + "properties": { + "type": { + "const": "OptionsSparkDfTransformer" + } + }, + "title": "OptionsSparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Interface to implement Spark-DataFrame transformers working with one input and one output (1:1) and options.\nThis trait extends OptionsGenericDfTransformer and passes a map of options as parameter to the transform function.\nThis is mainly used by custom transformers." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSnowparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomSnowparkDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSnowparkDfTransformer", + "required": [ + "className", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Snowpark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomSnowparkDfTransformer]] ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaNotebookSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "url": { + "type": "string", + "description": "Url to download notebook in IPYNB-format, which defines transformation." + }, + "functionName": { + "type": "string", + "description": "The notebook needs to contain a Scala-function with this name and type[[fnTransformType]] ." + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information for webservice, e.g. BasicAuthMode for user/pw authentication" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaNotebookSparkDfTransformer", + "required": [ + "url", + "functionName", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nThe code is loaded from a Notebook. 
It should define a transform function with a configurable name, which receives a DataObjectId, a DataFrame\nand a map of options and has to return a DataFrame, see also ([[fnTransformType]] ).\nNotebook-cells starting with \\\"//!IGNORE\\\" will be ignored." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDfTransformer", + "required": [ + "className", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomDfTransformer]] ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassGenericDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomGenericDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassGenericDfTransformer", + "required": [ + "className", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomGenericDfTransformer]] ." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"\nThe special token %{inputViewName} or ${inputViewName_} can be used to insert the temporary view name.\nThe input name is either the name of the DataObject, or the name of the previous transformation\nif this is not the first transformation of the chain. Make sure to change the standard name of\nthe previous transformation in that case." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "SQLDfTransformer", + "required": [ + "code", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom GenericDataFrame transformation between one input and one output (1:1) as SQL code.\nThe input data is available as temporary view in SQL. The inputs name is either the name of the DataObject,\nor the name of the previous transformation, if this is not the first transformation of the chain. Also note that to create\nthe name of temporary view, special characters are replaced by underscores and a postfix \\\"_sdltemp\\\" is added.\nIt is therefore recommended to use special token %{inputViewName} or ${inputViewName_} that will be\nreplaced with the name of the temporary view at runtime." + }, + { + "type": "object", + "properties": { + "type": { + "const": "StandardizeSparkDatatypesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + } + }, + "title": "StandardizeSparkDatatypesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardize datatypes of a Spark-DataFrame.\nCurrent implementation converts all decimal datatypes to a corresponding integral or float datatype" + }, + { + "type": "object", + "properties": { + "type": { + "const": "EncryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "encryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "EncryptColumnsTransformer", + "required": [ + "encryptColumns", + "type" + ], + "additionalProperties": false, + "description": "Encryption of specified columns using AES/GCM algorithm." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "transformerClassName": { + "type": "string", + "description": "class name implementing trait[[CustomDsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDsTransformer", + "required": [ + "transformerClassName", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-Dataset transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a SparkSession, a Dataset and a map of options and has to return a Dataset.\nThe Java/Scala class has to implement interface[[CustomDsTransformer]] ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaCodeSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "file": { + "type": "string", + "description": "File where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaCodeSparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The scala code has to implement a function of type[[fnTransformType]] ." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SparkRepartitionTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition value by repartitioning the DataFrame." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a partition value." + } + }, + "title": "SparkRepartitionTransformer", + "required": [ + "numberOfTasksPerPartition", + "type" + ], + "additionalProperties": false, + "description": "Repartition DataFrame\nFor detailled description about repartitioning DataFrames see also[[SparkRepartitionDef]]" + }, + { + "type": "object", + "properties": { + "type": { + "const": "PythonCodeDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Optional python code to user for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "file": { + "type": "string", + "description": "Optional file with python code to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "PythonCodeDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Python/PySpark code.\nNote that this transformer needs a Python and PySpark environment installed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "OptionsGenericDfTransformer" + } + }, + "title": "OptionsGenericDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Interface to implement GenericDataFrame transformers working with one input and one output (1:1) and options.\nThis trait extends GenericDfTransformerDef to pass a map of options as parameter to the transform function.\nThis is mainly used by custom transformers." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "FilterTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "filterClause": { + "type": "string", + "description": "Spark SQL expression to filter the DataFrame" + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "When parsing the configuration the runtime subFeedType for validating the filter expression is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "FilterTransformer", + "required": [ + "filterClause", + "type" + ], + "additionalProperties": false, + "description": "Apply a filter condition to a DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "AdditionalColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "additionalColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]]and added to the DataFrame as literal columns.\n[[DefaultExpressionData]] contains informations from the context of the SDLB job, like runId or feed name." + }, + "additionalDerivedColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against the input DataFrame and added to the DataFrame as derived columns." + } + }, + "title": "AdditionalColumnsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Add additional columns to the DataFrame by extracting information from the context or derived from input columns." + }, + { + "type": "object", + "properties": { + "type": { + "const": "WhitelistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnWhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to keep from DataFrame" + } + }, + "title": "WhitelistTransformer", + "required": [ + "columnWhitelist", + "type" + ], + "additionalProperties": false, + "description": "Apply a column whitelist to a DataFrame." + } + ], + "description": "Configuration for a GenericDfTransformerDef to be applied" + }, + "subFeedsToApply": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Names of SubFeeds the transformation should be applied to." + } + }, + "title": "DfTransformerWrapperDfsTransformer", + "required": [ + "type", + "transformer", + "subFeedsToApply" + ], + "additionalProperties": false, + "description": "A Transformer to use single DataFrame Transformers as multiple DataFrame Transformers.\nThis works by selecting the SubFeeds (DataFrames) the single DataFrame Transformer should be applied to.\nAll other SubFeeds will be passed through without transformation." 
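
(Illustrative sketch, not part of the generated schema file.) DfTransformerWrapperDfsTransformer applies a single-DataFrame transformer inside a multi-DataFrame action by naming the SubFeeds it should touch. A minimal entry for such an action's transformers list, with a hypothetical SubFeed name and filter clause:

    {
      type = DfTransformerWrapperDfsTransformer
      # wrap a 1:1 FilterTransformer and apply it only to the named SubFeed
      transformer = { type = FilterTransformer, filterClause = "created_at > '2023-01-01'" }
      subFeedsToApply = ["ext_orders"]
    }
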
+ }, + "PythonCodeDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "PythonCodeDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Optional python code to user for python transformation. The python code can use variables inputDfs and options. The transformed DataFrame has to be set with setOutputDfs." + }, + "file": { + "type": "string", + "description": "Optional file with python code to use for python transformation. The python code can use variables inputDfs and options. The transformed DataFrames has to be set with setOutputDfs." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "PythonCodeDfsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m) as Python/PySpark code.\nNote that this transformer needs a Python and PySpark environment installed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDfs`: Input DataFrames\n-`options`: Transformation options as Map[String,String]\nOutput DataFrames must be set with`setOutputDfs(dict)` ." + }, + "SQLDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "SQLDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Map of output names and corresponding SQL code for transformation.\nIf this is the last transformation in the chain, the output name has to match an output DataObject id,\notherwise it can be any name which will then be available in the next transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"\nThe special token ${inputViewName_} can be used to insert the name of temporary views.\nThe input name is either the id of an input DataObject, or the name of an output of the previous transformation\nif this is not the first transformation of the chain." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "SQLDfsTransformer", + "required": [ + "type", + "code" + ], + "additionalProperties": false, + "description": "Configuration of a custom GenericDataFrame transformation between many inputs and many outputs (n:m) as SQL code.\nThe input data is available as temporary views in SQL. As name for the temporary views the input DataObjectId is used\n(special characters are replaces by underscores).\nThe input data is available as temporary view in SQL. The input name is either an id of the input DataObject,\nor the name of an output of the previous transformation if this is not the first transformation of the chain.\nAlso note that to create the name of temporary view, special characters are replaced by underscores and a postfix \\\"_sdltemp\\\" is added.\nIt is therefore recommended to use the special token ${inputViewName_}, that will be replaced with the name\nof the temporary view at runtime.\n\nNote that you can access arbitrary tables from the metastore in the SQL code, but this is against the principle of SDLB\nto access data through DataObjects. Accessing tables directly in SQL code has a negative impact on the maintainability of the project." + }, + "ScalaClassGenericDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassGenericDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomGenericDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassGenericDfsTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m)\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and as\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomGenericDfsTransformer]] ." + }, + "ScalaClassSnowparkDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSnowparkDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomSnowparkDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "ScalaClassSnowparkDfsTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m)\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and as\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomSnowparkDfsTransformer]] ." + }, + "ScalaClassSparkDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDfsTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m)\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and as\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomDfsTransformer]] ." + }, + "ScalaClassSparkDsNTo1Transformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDsNTo1Transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "Class name implementing trait[[CustomDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + }, + "parameterResolution": { + "type": "string", + "description": "By default parameter resolution for transform function uses input Datasets id to match the corresponding parameter name.\nBut there are other options, see[[ParameterResolution]] .", + "enum": [ + "DataObjectOrdering ", + "DataObjectId " + ] + }, + "strictInputValidation": { + "type": "boolean", + "description": "Enforce that the number of input dataobjects must be the same as the number of input datasets. False by default,\nbecause when chaining multiple transformations in the same action, you may not need all output Data objects of the previous transformations.\nHowever, having more input parameters in your transform method than Dataobjects will always fail." 
+ }, + "inputColumnAutoSelect": { + "type": "boolean", + "description": "Determine if the input-datasets should contain exactly the columns defined by the corresponding case class (spark does not ensure this out of the box). True per default." + }, + "outputColumnAutoSelect": { + "type": "boolean", + "description": "Determine if the output-dataset should contain exactly the columns defined by the corresponding case class (spark does not ensure this out of the box). True per default." + }, + "addPartitionValuesToOutput": { + "type": "boolean", + "description": "If set to true and if one partition-value is processed at a time, the partition-columns will be added to the output-dataset\nIf more than one partition-value is processed simultaneously, the transformation will fail because it cannot\ndetermine which row should get which partition-value. False by default." + }, + "outputDatasetId": { + "type": "string", + "description": "Optional id of the output Dataset. Default is the id of the Actions first output DataObject." + } + }, + "title": "ScalaClassSparkDsNTo1Transformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-Dataset transformation between N inputs and 1 outputs (N:1) as Java/Scala Class\nDefine a transform function that receives a SparkSession, a map of options and as many DataSets as you want, and that has to return one Dataset.\nThe Java/Scala class has to implement interface[[CustomDsNto1Transformer]] ." + }, + "ScalaCodeSparkDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaCodeSparkDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "file": { + "type": "string", + "description": "File where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaCodeSparkDfsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m) as Scala code which is compiled at runtime.\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and has\nto return a map of output DataObjectIds with DataFrames. The scala code has to implement a function of type[[fnTransformType]] ." 
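
(Illustrative sketch, not part of the generated schema file.) As an example of the n:m transformers defined above, an SQLDfsTransformer entry could look as follows; the keys of the code map name the outputs, and the ${inputViewName_...} tokens follow the convention given in its definition. All ids are hypothetical:

    {
      type = SQLDfsTransformer
      code = {
        # output name -> SQL over the temporary input views
        int_joined = "select o.*, c.country from ${inputViewName_ext_orders} o join ${inputViewName_ext_customers} c on o.customer_id = c.id"
      }
    }
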
+ } + }, + "Action": { + "CopyAction": { + "type": "object", + "properties": { + "type": { + "const": "CopyAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "deleteDataAfterRead": { + "type": "boolean", + "description": "a flag to enable deletion of input partitions after copying." + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "string", + "description": "Optional SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "CustomDfTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1)\nDefine a transform function which receives a DataObjectIds, a DataFrames and a map of options and has to return a\nDataFrame, see also[[CustomDfTransformer]].\n\nNote about Python transformation: Environment with Python and PySpark needed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." 
+ }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfTransformer/AdditionalColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/BlacklistTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DataValidationTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DecryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/EncryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/FilterTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/PythonCodeDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SQLDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassGenericDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSnowparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaCodeSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaNotebookSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SparkRepartitionTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeColNamesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeSparkDatatypesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/WhitelistTransformer" + } + ] + }, + "description": "optional list of transformations to apply. See[[spark.transformer]] for a list of included Transformers.\nThe transformations are applied according to the lists ordering." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." + }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. 
The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "saveModeOptions": { + "oneOf": [ + { + "$ref": "#/definitions/SaveModeOptions/SaveModeGenericOptions" + }, + { + "$ref": "#/definitions/SaveModeOptions/SaveModeMergeOptions" + } + ], + "description": "override and parametrize saveMode set in output DataObject configurations when writing to DataObjects." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + }, + "agentId": { + "type": "string" + } + }, + "title": "CopyAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "This[[Action]] copies data between an input and output DataObject using DataFrames.\nThe input DataObject reads the data and converts it to a DataFrame according to its definition.\nThe DataFrame might be transformed using SQL or DataFrame transformations.\nThen the output DataObjects writes the DataFrame to the output according to its definition." + }, + "CustomDataFrameAction": { + "type": "object", + "properties": { + "type": { + "const": "CustomDataFrameAction" + }, + "inputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "input DataObject\\'s" + }, + "outputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output DataObject\\'s" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfsTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional map of output DataObject id and corresponding SQL Code.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "CustomDfsTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m).\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and has\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomDfsTransformer]] ." + }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfsTransformer/DfTransformerWrapperDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/PythonCodeDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/SQLDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassGenericDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassSnowparkDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassSparkDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassSparkDsNTo1Transformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaCodeSparkDfsTransformer" + } + ] + }, + "description": "list of transformations to apply. See[[spark.transformer]] for a list of included Transformers.\nThe transformations are applied according to the ordering of the list.\nNote that all outputs of previous transformers are kept as input for next transformer,\nbut in the end only outputs of the last transformer are mapped to output DataObjects." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." + }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "mainInputId": { + "type": "string", + "description": "optional selection of main inputId used for execution mode and partition values propagation. Only needed if there are multiple input DataObject\\'s." + }, + "mainOutputId": { + "type": "string", + "description": "optional selection of main outputId used for execution mode and partition values propagation. Only needed if there are multiple output DataObject\\'s." 
+ }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + }, + "recursiveInputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output of action that are used as input in the same action" + }, + "inputIdsToIgnoreFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional list of input ids to ignore filter (partition values & filter clause)" + } + }, + "title": "CustomDataFrameAction", + "required": [ + "type", + "inputIds", + "outputIds" + ], + "additionalProperties": false, + "description": "This[[Action]] transforms data between many input and output DataObjects using DataFrames.\nCustomDataFrameAction allows to define transformations between n input DataObjects and m output DataObjects,\nbut is is recommended to implement n:1 or 1:m transformations, as otherwise dependencies between DataObjects might not be accurate anymore.\nThe input DataFrames might be transformed using SQL or DataFrame transformations.\nWhen chaining multiple transformers, output DataFrames of previous transformers are available as input DataFrames for later transformers by their corresponding name." 
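
(Illustrative sketch, not part of the generated schema file.) A CustomDataFrameAction as defined above, wired from two inputs to one output through a single ScalaClassSparkDfsTransformer; the action id, DataObject ids and class name are hypothetical, and the surrounding actions block is assumed to be the usual top-level section of an SDLB config:

    actions {
      join_orders_customers {
        type = CustomDataFrameAction
        inputIds = ["ext_orders", "ext_customers"]
        outputIds = ["int_joined"]
        transformers = [
          # the class would implement CustomDfsTransformer as noted in the schema
          { type = ScalaClassSparkDfsTransformer, className = "com.example.JoinOrdersTransformer" }
        ]
      }
    }
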
+ }, + "CustomFileAction": { + "type": "object", + "properties": { + "type": { + "const": "CustomFileAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name to load transformer code from" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from" + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + } + }, + "title": "CustomFileTransformerConfig", + "additionalProperties": false, + "description": "Configuration of custom file transformation between one input and one output (1:1)" + }, + "filesPerPartition": { + "type": "integer", + "description": "number of files per Spark partition" + }, + "breakFileRefLineage": { + "type": "boolean", + "description": "Stop propagating input FileRefs through action and instead get new FileRefs from DataObject according to the SubFeed\\'s partitionValue.\nThis is needed to reprocess all files of a path/partition instead of the FileRef\\'s passed from the previous Action." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "CustomFileAction", + "required": [ + "type", + "inputId", + "outputId", + "transformer" + ], + "additionalProperties": false, + "description": "[[Action]] to transform files between two Hadoop Data Objects.\nThe transformation is executed in distributed mode on the Spark executors.\nA custom file transformer must be given, which reads a file from Hadoop and writes it back to Hadoop." + }, + "CustomScriptAction": { + "type": "object", + "properties": { + "type": { + "const": "CustomScriptAction" + }, + "inputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "input DataObject\\'s" + }, + "outputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output DataObject\\'s" + }, + "scripts": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/ParsableScriptDef/CmdScript" + }, + { + "$ref": "#/definitions/ParsableScriptDef/DockerRunScript" + } + ] + }, + "description": "definition of scripts to execute" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "CustomScriptAction", + "required": [ + "type", + "inputIds", + "outputIds" + ], + "additionalProperties": false, + "description": "[[Action]] execute script after multiple input DataObjects are ready, notifying multiple output DataObjects when script succeeded.\n\nNote that this action can also be used to give your data pipeline additional structure, e.g. adding a decision point after several actions have been executed." + }, + "DeduplicateAction": { + "type": "object", + "properties": { + "type": { + "const": "DeduplicateAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "string", + "description": "Optional SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." 
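To make the CustomFileAction definition above concrete, a hedged HOCON sketch; the action name, DataObject ids and transformer class are hypothetical placeholders:

    actions {
      transform-files {                               # hypothetical action name
        type = CustomFileAction
        inputId = stg-raw-files                       # hypothetical Hadoop DataObject ids
        outputId = int-processed-files
        transformer {
          className = "com.example.MyFileTransformer" # hypothetical class providing the transformation code
        }
        filesPerPartition = 10                        # number of files per Spark partition
      }
    }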
+ }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "CustomDfTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1)\nDefine a transform function which receives a DataObjectIds, a DataFrames and a map of options and has to return a\nDataFrame, see also[[CustomDfTransformer]].\n\nNote about Python transformation: Environment with Python and PySpark needed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfTransformer/AdditionalColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/BlacklistTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DataValidationTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DecryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/EncryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/FilterTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/PythonCodeDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SQLDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassGenericDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSnowparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaCodeSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaNotebookSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SparkRepartitionTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeColNamesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeSparkDatatypesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/WhitelistTransformer" + } + ] + }, + "description": "optional list of transformations to apply before deduplication. See[[sparktransformer]] for a list of included Transformers.\nThe transformations are applied according to the lists ordering." 
+ }, + "ignoreOldDeletedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns in Schema Evolution" + }, + "ignoreOldDeletedNestedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns from nested data types in Schema Evolution.\nKeeping deleted columns in complex data types has performance impact as all new data\nin the future has to be converted by a complex function." + }, + "updateCapturedColumnOnlyWhenChanged": { + "type": "boolean", + "description": "Set to true to enable update Column[[TechnicalTableColumn.captured]] only if Record has changed in the source, instead of updating it with every execution (default=false).\nThis results in much less records updated with saveMode.Merge." + }, + "mergeModeEnable": { + "type": "boolean", + "description": "Set to true to use saveMode.Merge for much better performance. Output DataObject must implement[[CanMergeDataFrame]] if enabled (default = false)." + }, + "mergeModeAdditionalJoinPredicate": { + "type": "string", + "description": "To optimize performance it might be interesting to limit the records read from the existing table data, e.g. it might be sufficient to use only the last 7 days.\nSpecify a condition to select existing data to be used in transformation as Spark SQL expression.\nUse table alias \\'existing\\' to reference columns of the existing table data." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." + }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." 
+ } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "DeduplicateAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "This[[Action]]copies and deduplicates data between an input and output DataObject using DataFrames.\nDeduplication keeps the last record for every key, also after it has been deleted in the source.\nThe DataFrame might be transformed using SQL or DataFrame transformations. These transformations are applied before the deduplication.\n\nDeduplicateAction adds an additional Column[[TechnicalTableColumn.captured]]. It contains the timestamp of the last occurrence of the record in the source.\nThis creates lots of updates. Especially when using saveMode.Merge it is better to set[[TechnicalTableColumn.captured]]to the last change of the record in the source. Use updateCapturedColumnOnlyWhenChanged = true to enable this optimization.\n\nDeduplicateAction needs a transactional table (e.g.[[TransactionalTableDataObject]]) as output with defined primary keys.\nIf output implements[[CanMergeDataFrame]] , saveMode.Merge can be enabled by setting mergeModeEnable = true. This allows for much better performance." + }, + "FileTransferAction": { + "type": "object", + "properties": { + "type": { + "const": "FileTransferAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "overwrite": { + "type": "boolean", + "description": "Allow existing output file to be overwritten. If false the action will fail if a file to be created already exists. Default is true." + }, + "maxParallelism": { + "type": "integer", + "description": "Set maximum of files to be transferred in parallel.\nNote that this information can also be set on DataObjects like SFtpFileRefDataObject, resp. its SFtpFileRefConnection.\nThe FileTransferAction will then take the minimum parallelism of input, output and this attribute.\nIf parallelism is not specified on input, output and this attribute, it is set to 1." + }, + "filenameExtractorRegex": { + "type": "string", + "description": "A regex to extract a part of the filename to keep in the translated FileRef.\nIf the regex contains group definitions, the first group is taken, otherwise the whole regex match.\nDefault is None which keeps the whole filename (without path)." + }, + "breakFileRefLineage": { + "type": "boolean", + "description": "If set to true, file references passed on from previous action are ignored by this action.\nThe action will detect on its own what files it is going to process." 
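A minimal DeduplicateAction sketch based on the definition above; the ids are hypothetical, and the output is assumed to be a transactional table DataObject with a defined primary key that implements CanMergeDataFrame, so mergeModeEnable can be used:

    actions {
      dedup-customers {                               # hypothetical action name
        type = DeduplicateAction
        inputId = stg-customers                       # hypothetical DataObject ids
        outputId = int-customers                      # assumed transactional table with primary key
        mergeModeEnable = true                        # requires the output to implement CanMergeDataFrame
        updateCapturedColumnOnlyWhenChanged = true    # update the captured column only when the record changed
      }
    }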
+ }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "FileTransferAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "[[Action]] to transfer files between SFtp, Hadoop and local Fs." + }, + "HistorizeAction": { + "type": "object", + "properties": { + "type": { + "const": "HistorizeAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "string", + "description": "Optional SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." 
+ }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "CustomDfTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1)\nDefine a transform function which receives a DataObjectIds, a DataFrames and a map of options and has to return a\nDataFrame, see also[[CustomDfTransformer]].\n\nNote about Python transformation: Environment with Python and PySpark needed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfTransformer/AdditionalColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/BlacklistTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DataValidationTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DecryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/EncryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/FilterTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/PythonCodeDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SQLDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassGenericDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSnowparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaCodeSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaNotebookSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SparkRepartitionTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeColNamesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeSparkDatatypesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/WhitelistTransformer" + } + ] + }, + "description": "optional list of transformations to apply before historization. See[[sparktransformer]] for a list of included Transformers.\nThe transformations are applied according to the lists ordering." + }, + "filterClause": { + "type": "string", + "description": "Filter of data to be processed by historization. It can be used to exclude historical data not needed to create new history, for performance reasons.\nNote that filterClause is only applied if mergeModeEnable=false. Use mergeModeAdditionalJoinPredicate if mergeModeEnable=true to achieve a similar performance tuning." 
+ }, + "historizeBlacklist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional list of columns to ignore when comparing two records in historization. Can not be used together with[[historizeWhitelist]] ." + }, + "historizeWhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional final list of columns to use when comparing two records in historization. Can not be used together with[[historizeBlacklist]] ." + }, + "ignoreOldDeletedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns in Schema Evolution" + }, + "ignoreOldDeletedNestedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns from nested data types in Schema Evolution.\nKeeping deleted columns in complex data types has performance impact as all new data\nin the future has to be converted by a complex function." + }, + "mergeModeEnable": { + "type": "boolean", + "description": "Set to true to use saveMode.Merge for much better performance by using incremental historization.\nOutput DataObject must implement[[CanMergeDataFrame]] if enabled (default = false).\nIncremental historization will add an additional \\\"dl_hash\\\" column which is used for change detection between\nexisting and new data.\nNote that enabling mergeMode on an existing HistorizeAction will create a new version for every\nnew record in the output table, as \\\"dl_hash\\\" column is initially null." + }, + "mergeModeAdditionalJoinPredicate": { + "type": "string", + "description": "To optimize performance it might be interesting to limit the records read from the existing table data, e.g. it might be sufficient to use only the last 7 days.\nSpecify a condition to select existing data to be used in transformation as Spark SQL expression.\nUse table alias \\'existing\\' to reference columns of the existing table data." + }, + "mergeModeCDCColumn": { + "type": "string", + "description": "Optional colum holding the CDC operation to replay to enable mergeModeCDC. If CDC information is available from the source\nincremental historization can be further optimized, as the join with existing data can be omitted.\nNote that this should be enabled only, if input data contains just inserted, updated and deleted records.\nHistorizeAction in mergeModeCDC will make *no* change detection on its own, and create a new version for every inserted/updated record it receives!\nYou will also need to specify parameter mergeModeCDCDeletedValue to use this and mergeModeEnable=true.\nIncrement CDC historization will add an additional column \\\"dl_dummy\\\" to the target table,\nwhich is used to work around limitations of SQL merge statement, but \\\"dl_hash\\\" column from mergeMode is no longer needed." + }, + "mergeModeCDCDeletedValue": { + "type": "string", + "description": "Optional value of mergeModeCDCColumn that marks a record as deleted." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." 
+ }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "HistorizeAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "This[[Action]]historizes data between an input and output DataObject using DataFrames.\nHistorization creates a technical history of data by creating valid-from/to columns.\nThe DataFrame might be transformed using SQL or DataFrame transformations. These transformations are applied before the deduplication.\n\nHistorizeAction needs a transactional table (e.g. implementation of[[TransactionalTableDataObject]]) as output with defined primary keys.\n\nNormal historization join new with all existing data, and rewrites all data in output table. This is not optimal from\na performance perspective.\nIt can be optimized if output object supports[[CanMergeDataFrame]] . In that case you can\nset mergeModeEnable=true to use incremental historization, which does not rewrite all data in output table. It still needs to\njoin new data with all existing data, but uses hash values to minimize data transfer.\nIf you have change-data-capture (CDC) information available to identify deleted records, you can set\nmergeModeCDCColumn and mergeModeCDCDeletedValue to even avoid the join between new and existing data. This is optimal from\na performance perspective." 
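A corresponding HistorizeAction sketch; again all ids are hypothetical and the output is assumed to be a transactional table DataObject with a primary key:

    actions {
      historize-contracts {                           # hypothetical action name
        type = HistorizeAction
        inputId = int-contracts                       # hypothetical DataObject ids
        outputId = btl-contracts-history
        mergeModeEnable = true                        # incremental historization; output must implement CanMergeDataFrame
        # if CDC information is available, the join with existing data can be avoided:
        # mergeModeCDCColumn = "cdc_operation"        # hypothetical CDC column name
        # mergeModeCDCDeletedValue = "D"
      }
    }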
+ }, + "ProxyAction": { + "type": "object", + "properties": { + "type": { + "const": "ProxyAction" + }, + "wrappedAction": { + "oneOf": [ + { + "$ref": "#/definitions/Action/CustomFileAction" + }, + { + "$ref": "#/definitions/Action/CustomScriptAction" + }, + { + "$ref": "#/definitions/Action/HistorizeAction" + } + ] + }, + "agent": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AzureRelayAgent" + }, + "url": { + "type": "string" + }, + "connections": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "HiveTableConnection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "optional schema, authority and base path for tables directory on hadoop." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HiveTableConnection", + "required": [ + "db", + "type" + ], + "additionalProperties": false, + "description": "Connection information for hive tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableConnection" + }, + "catalog": { + "type": "string", + "description": "optional catalog to be used for this connection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for tables directory on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "checkDeltaLakeSparkOptions": { + "type": "boolean" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "DeltaLakeTableConnection", + "required": [ + "db", + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for DeltaLake tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefConnection" + }, + "host": { + "type": "string", + "description": "sftp host" + }, + "port": { + "type": "integer", + "description": "port of sftp service, default is 22" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode and PublicKeyAuthMode are supported." + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "proxy host" + }, + "port": { + "type": "integer", + "description": "proxy port" + }, + "proxyType": { + "type": "string", + "description": "Type of proxy: HTTP or SOCKS. Default is HTTP.", + "enum": [ + "DIRECT", + "HTTP", + "SOCKS" + ] + } + }, + "title": "JavaNetProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false, + "description": "Proxy configuration to create java.net.Proxy instance." 
+ }, + "ignoreHostKeyVerification": { + "type": "boolean", + "description": "do not validate host key if true, default is false" + }, + "maxParallelConnections": { + "type": "integer", + "description": "number of parallel sftp connections created by an instance of this connection" + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SFtpFileRefConnection", + "required": [ + "host", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "SFTP Connection information" + }, + { + "type": "object", + "properties": { + "type": { + "const": "HadoopFileConnection" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for accessing files on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HadoopFileConnection", + "required": [ + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for files on hadoop" + }, + { + "type": "object", + "properties": { + "type": { + "const": "KafkaConnection" + }, + "brokers": { + "type": "string", + "description": "comma separated list of kafka bootstrap server incl. port, e.g. \\\"host1:9092,host2:9092:" + }, + "schemaRegistry": { + "type": "string", + "description": "url of schema registry service, e.g. 
\\\"https://host2\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html)" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "KafkaConnection", + "required": [ + "brokers", + "type" + ], + "additionalProperties": false, + "description": "Connection information for kafka" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeConnection" + }, + "url": { + "type": "string", + "description": "snowflake connection url" + }, + "warehouse": { + "type": "string", + "description": "Snowflake namespace" + }, + "database": { + "type": "string", + "description": "Snowflake database" + }, + "role": { + "type": "string", + "description": "Snowflake role" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SnowflakeConnection", + "required": [ + "url", + "warehouse", + "database", + "role", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for Snowflake databases.\nThe connection can be used for SnowflakeTableDataObjects\nIf multiple SnowflakeTableDataObjects share a connection, they share the same Snowpark session" + }, + { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableConnection" + }, + "url": { + "type": "string", + "description": "jdbc connection url" + }, + "driver": { + "type": "string", + "description": "class name of jdbc driver" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." 
+ }, + "db": { + "type": "string", + "description": "jdbc database" + }, + "maxParallelConnections": { + "type": "integer", + "description": "max number of parallel jdbc connections created by an instance of this connection, default is 3\nNote that Spark manages JDBC Connections on its own. This setting only applies to JDBC connection\nused by SDL for validating metadata or pre/postSQL." + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "connectionPoolMaxWaitTimeSec": { + "type": "integer", + "description": "timeout when waiting for connection in pool to become available. Default is 600 seconds (10 minutes)." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + }, + "autoCommit": { + "type": "boolean", + "description": "flag to enable or disable the auto-commit behaviour. When autoCommit is enabled, each database request is executed in its own transaction.\nDefault is autoCommit = false. It is not recommended to enable autoCommit as it will deactivate any transactional behaviour.", + "deprecated": true + }, + "connectionInitSql": { + "type": "string", + "description": "SQL statement to be executed every time a new connection is created, for example to set session parameters" + } + }, + "title": "JdbcTableConnection", + "required": [ + "url", + "driver", + "type" + ], + "additionalProperties": false, + "description": "Connection information for jdbc tables.\nIf authentication is needed, user and password must be provided." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SplunkConnection" + }, + "host": { + "type": "string", + "description": "" + }, + "port": { + "type": "integer", + "description": "" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SplunkConnection", + "required": [ + "host", + "port", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for splunk queries" + } + ] + } + } + }, + "title": "AzureRelayAgent", + "required": [ + "url", + "connections", + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "const": "JettyAgent" + }, + "url": { + "type": "string" + }, + "connections": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "HiveTableConnection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "optional schema, authority and base path for tables directory on hadoop." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HiveTableConnection", + "required": [ + "db", + "type" + ], + "additionalProperties": false, + "description": "Connection information for hive tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableConnection" + }, + "catalog": { + "type": "string", + "description": "optional catalog to be used for this connection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for tables directory on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "checkDeltaLakeSparkOptions": { + "type": "boolean" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "DeltaLakeTableConnection", + "required": [ + "db", + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for DeltaLake tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefConnection" + }, + "host": { + "type": "string", + "description": "sftp host" + }, + "port": { + "type": "integer", + "description": "port of sftp service, default is 22" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode and PublicKeyAuthMode are supported." + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "proxy host" + }, + "port": { + "type": "integer", + "description": "proxy port" + }, + "proxyType": { + "type": "string", + "description": "Type of proxy: HTTP or SOCKS. Default is HTTP.", + "enum": [ + "DIRECT", + "HTTP", + "SOCKS" + ] + } + }, + "title": "JavaNetProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false, + "description": "Proxy configuration to create java.net.Proxy instance." 
+ }, + "ignoreHostKeyVerification": { + "type": "boolean", + "description": "do not validate host key if true, default is false" + }, + "maxParallelConnections": { + "type": "integer", + "description": "number of parallel sftp connections created by an instance of this connection" + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SFtpFileRefConnection", + "required": [ + "host", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "SFTP Connection information" + }, + { + "type": "object", + "properties": { + "type": { + "const": "HadoopFileConnection" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for accessing files on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HadoopFileConnection", + "required": [ + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for files on hadoop" + }, + { + "type": "object", + "properties": { + "type": { + "const": "KafkaConnection" + }, + "brokers": { + "type": "string", + "description": "comma separated list of kafka bootstrap server incl. port, e.g. \\\"host1:9092,host2:9092:" + }, + "schemaRegistry": { + "type": "string", + "description": "url of schema registry service, e.g. 
\\\"https://host2\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html)" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "KafkaConnection", + "required": [ + "brokers", + "type" + ], + "additionalProperties": false, + "description": "Connection information for kafka" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeConnection" + }, + "url": { + "type": "string", + "description": "snowflake connection url" + }, + "warehouse": { + "type": "string", + "description": "Snowflake namespace" + }, + "database": { + "type": "string", + "description": "Snowflake database" + }, + "role": { + "type": "string", + "description": "Snowflake role" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SnowflakeConnection", + "required": [ + "url", + "warehouse", + "database", + "role", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for Snowflake databases.\nThe connection can be used for SnowflakeTableDataObjects\nIf multiple SnowflakeTableDataObjects share a connection, they share the same Snowpark session" + }, + { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableConnection" + }, + "url": { + "type": "string", + "description": "jdbc connection url" + }, + "driver": { + "type": "string", + "description": "class name of jdbc driver" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." 
+ }, + "db": { + "type": "string", + "description": "jdbc database" + }, + "maxParallelConnections": { + "type": "integer", + "description": "max number of parallel jdbc connections created by an instance of this connection, default is 3\nNote that Spark manages JDBC Connections on its own. This setting only applies to JDBC connection\nused by SDL for validating metadata or pre/postSQL." + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "connectionPoolMaxWaitTimeSec": { + "type": "integer", + "description": "timeout when waiting for connection in pool to become available. Default is 600 seconds (10 minutes)." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + }, + "autoCommit": { + "type": "boolean", + "description": "flag to enable or disable the auto-commit behaviour. When autoCommit is enabled, each database request is executed in its own transaction.\nDefault is autoCommit = false. It is not recommended to enable autoCommit as it will deactivate any transactional behaviour.", + "deprecated": true + }, + "connectionInitSql": { + "type": "string", + "description": "SQL statement to be executed every time a new connection is created, for example to set session parameters" + } + }, + "title": "JdbcTableConnection", + "required": [ + "url", + "driver", + "type" + ], + "additionalProperties": false, + "description": "Connection information for jdbc tables.\nIf authentication is needed, user and password must be provided." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SplunkConnection" + }, + "host": { + "type": "string", + "description": "" + }, + "port": { + "type": "integer", + "description": "" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SplunkConnection", + "required": [ + "host", + "port", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for splunk queries" + } + ] + } + } + }, + "title": "JettyAgent", + "required": [ + "url", + "connections", + "type" + ], + "additionalProperties": false + } + ] + } + }, + "title": "ProxyAction", + "required": [ + "type", + "wrappedAction", + "agent" + ], + "additionalProperties": false, + "description": "Allows to execute the action defined by\n\nOTHERTAG: on a remote agent defined by\n\nOTHERTAG: .\nIf the execution of\n\nOTHERTAG: is successful, the ProxyAction will return an empty SparkSubFeed by the correct schema." 
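A hedged sketch of a ProxyAction wiring the pieces above together; the wrapped action, agent URL and connection map are hypothetical placeholders:

    actions {
      remote-script {                                 # hypothetical action name
        type = ProxyAction
        wrappedAction {
          type = CustomScriptAction
          inputIds = [stg-input]                      # hypothetical DataObject ids
          outputIds = [stg-output]
        }
        agent {
          type = JettyAgent
          url = "http://agent-host:4441"              # hypothetical agent endpoint
          connections {}                              # connections the agent should use, per the definitions above
        }
      }
    }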
+ } + } + }, + "properties": { + "global": { + "type": "object", + "properties": { + "kryoClasses": { + "type": "array", + "items": { + "type": "string" + }, + "description": "classes to register for spark kryo serialization" + }, + "sparkOptions": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "spark options\n\nThe value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "statusInfo": { + "type": "object", + "properties": { + "port": { + "type": "integer", + "description": ": port with which the first connection attempt is made" + }, + "maxPortRetries": { + "type": "integer", + "description": ": If port is already in use, we will increment port by one and try with that new port.\nmaxPortRetries describes how many times this should be attempted. If set to 0 it will not be attempted.\nValues below 0 are not allowed." + }, + "stopOnEnd": { + "type": "boolean", + "description": ": Set to false if the Server should remain online even after SDL has finished its execution.\nIn that case, the Application needs to be stopped manually. Useful for debugging." + } + }, + "title": "StatusInfoConfig", + "additionalProperties": false, + "description": "Configuration for the Server that provides live status info of the current DAG Execution" + }, + "enableHive": { + "type": "boolean", + "description": "enable hive for spark session" + }, + "memoryLogTimer": { + "type": "object", + "properties": { + "intervalSec": { + "type": "integer", + "description": "interval in seconds between memory usage logs" + }, + "logLinuxMem": { + "type": "boolean", + "description": "enable logging linux memory" + }, + "logLinuxCGroupMem": { + "type": "boolean", + "description": "enable logging details about linux cgroup memory" + }, + "logBuffers": { + "type": "boolean", + "description": "enable logging details about different jvm buffers" + } + }, + "title": "MemoryLogTimerConfig", + "required": [ + "intervalSec" + ], + "additionalProperties": false, + "description": "Configuration for periodic memory usage logging" + }, + "shutdownHookLogger": { + "type": "boolean", + "description": "enable shutdown hook logger to trace shutdown cause" + }, + "stateListeners": { + "type": "array", + "items": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "fully qualified class name of class implementing StateListener interface. The class needs a constructor with one parameter`options: Map[String,String]` ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "Options are passed to StateListener constructor.\n\nThe value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "StateListenerConfig", + "required": [ + "className" + ], + "additionalProperties": false, + "description": "Configuration to notify interested parties about action results & metric" + }, + "description": "Define state listeners to be registered for receiving events of the execution of SmartDataLake job" + }, + "sparkUDFs": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "fully qualified class name of class implementing SparkUDFCreator interface. The class needs a constructor without parameters." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options are passed to SparkUDFCreator apply method." + } + }, + "title": "SparkUDFCreatorConfig", + "required": [ + "className" + ], + "additionalProperties": false, + "description": "Configuration to register a UserDefinedFunction in the spark session of SmartDataLake." + }, + "description": "Define UDFs to be registered in spark session. The registered UDFs are available in Spark SQL transformations\nand expression evaluation, e.g. configuration of ExecutionModes." + }, + "pythonUDFs": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python UDF." + }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python UDF." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options are available in your python code as variable options." + } + }, + "title": "PythonUDFCreatorConfig", + "additionalProperties": false, + "description": "Configuration to register a Python UDF in the spark session of SmartDataLake.\nDefine a python function with type hints i python code and register it in global configuration.\nThe name of the function must match the name you use to declare it in GlobalConf.\nThe Python function can then be used in Spark SQL expressions." + }, + "description": "Define UDFs in python to be registered in spark session. The registered UDFs are available in Spark SQL transformations\nbut not for expression evaluation." + }, + "secretProviders": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "fully qualified class name of class implementing SecretProvider interface. The class needs a constructor with parameter \\\"options: Map[String,String]\\\"." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options are passed to SecretProvider apply method." + } + }, + "title": "SecretProviderConfig", + "required": [ + "className" + ], + "additionalProperties": false, + "description": "Configuration to register a SecretProvider." + }, + "description": "Define SecretProvider\\'s to be registered." + }, + "allowOverwriteAllPartitionsWithoutPartitionValues": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Configure a list of exceptions for partitioned DataObject id\\'s,\nwhich are allowed to overwrite the all partitions of a table if no partition values are set.\nThis is used to override/avoid a protective error when using SDLSaveMode.OverwriteOptimized|OverwritePreserveDirectories.\nDefine it as a list of DataObject id\\'s." 
+ }, + "allowAsRecursiveInput": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of DataObjects for which the validation rules for Action.recursiveInputIds are *not* checked.\nThe validation rules are\n1) that recursive input DataObjects must also be listed in output DataObjects of the same action\n2) the DataObject must implement TransactionalSparkTableDataObject interface\nListing a DataObject in allowAsRecursiveInput can be used for well thought exceptions, but should be avoided in general.\nNote that if 1) is true, also 2) must be fullfilled for Spark to work properly (because Spark can\\'t read/write the same storage location in the same job),\nbut there might be cases with recursions with different Actions involved, that dont need to fullfill 2)." + }, + "synchronousStreamingTriggerIntervalSec": { + "type": "integer", + "description": "Trigger interval for synchronous actions in streaming mode in seconds (default = 60 seconds)\nThe synchronous actions of the DAG will be executed with this interval if possile.\nNote that for asynchronous actions there are separate settings, e.g. SparkStreamingMode.triggerInterval." + }, + "environment": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Override environment settings defined in Environment object by setting the corresponding key to the desired value (key in camelcase notation with the first letter in lowercase)" + }, + "pluginOptions": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "GlobalConfig", + "additionalProperties": false, + "description": "Global configuration options\n\nNote that global configuration is responsible to hold SparkSession, so that its created once and only once per SDLB job.\nThis is especially important if JVM is shared between different SDL jobs (e.g. Databricks cluster), because sharing SparkSession in object Environment survives the current SDLB job." 
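To put the GlobalConfig block just described into context: it is the one optional top-level section, sitting next to the connections, dataObjects and actions maps that follow, of which only dataObjects and actions are required. A minimal sketch of that top-level layout, again parsed with Typesafe Config; object ids, data object properties and option values are illustrative only.

  import com.typesafe.config.ConfigFactory

  // "global" with a few of the optional keys listed above, plus the two required sections.
  val minimalSdlConfig =
    """global {
      |  enableHive = false
      |  synchronousStreamingTriggerIntervalSec = 60
      |  sparkOptions {
      |    "spark.sql.shuffle.partitions" = "2"
      |  }
      |}
      |
      |dataObjects {
      |  src-csv { type = CsvFileDataObject, path = "data/src" }
      |  dst-csv { type = CsvFileDataObject, path = "data/dst" }
      |}
      |
      |actions {
      |  copy-src-to-dst { type = CopyAction, inputId = src-csv, outputId = dst-csv }
      |}""".stripMargin

  val config = ConfigFactory.parseString(minimalSdlConfig)
  assert(config.getString("actions.copy-src-to-dst.type") == "CopyAction")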
+ }, + "connections": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/definitions/Connection/DeltaLakeTableConnection" + }, + { + "$ref": "#/definitions/Connection/HadoopFileConnection" + }, + { + "$ref": "#/definitions/Connection/HiveTableConnection" + }, + { + "$ref": "#/definitions/Connection/JdbcTableConnection" + }, + { + "$ref": "#/definitions/Connection/KafkaConnection" + }, + { + "$ref": "#/definitions/Connection/SFtpFileRefConnection" + }, + { + "$ref": "#/definitions/Connection/SnowflakeConnection" + }, + { + "$ref": "#/definitions/Connection/SplunkConnection" + } + ], + "description": "Map Connection name : definition" + } + }, + "dataObjects": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/definitions/DataObject/AccessTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/ActionsExporterDataObject" + }, + { + "$ref": "#/definitions/DataObject/AirbyteDataObject" + }, + { + "$ref": "#/definitions/DataObject/AvroFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/CsvFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/CustomDfDataObject" + }, + { + "$ref": "#/definitions/DataObject/CustomFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/DataObjectsExporterDataObject" + }, + { + "$ref": "#/definitions/DataObject/DeltaLakeTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/ExcelFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/HiveTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/JdbcTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/JmsDataObject" + }, + { + "$ref": "#/definitions/DataObject/JsonFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/KafkaTopicDataObject" + }, + { + "$ref": "#/definitions/DataObject/PKViolatorsDataObject" + }, + { + "$ref": "#/definitions/DataObject/ParquetFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/RawFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/RelaxedCsvFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/SFtpFileRefDataObject" + }, + { + "$ref": "#/definitions/DataObject/SnowflakeTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/SplunkDataObject" + }, + { + "$ref": "#/definitions/DataObject/TickTockHiveTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/WebserviceFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/XmlFileDataObject" + } + ], + "description": "Map of DataObject name and definition" + } + }, + "actions": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/definitions/Action/CopyAction" + }, + { + "$ref": "#/definitions/Action/CustomDataFrameAction" + }, + { + "$ref": "#/definitions/Action/CustomFileAction" + }, + { + "$ref": "#/definitions/Action/CustomScriptAction" + }, + { + "$ref": "#/definitions/Action/DeduplicateAction" + }, + { + "$ref": "#/definitions/Action/FileTransferAction" + }, + { + "$ref": "#/definitions/Action/HistorizeAction" + }, + { + "$ref": "#/definitions/Action/ProxyAction" + } + ], + "description": "Map of Action name and definition" + } + } + }, + "required": [ + "dataObjects", + "actions" + ], + "additionalProperties": true +} \ No newline at end of file diff --git a/src/main/scala/io/smartdatalake/completion/SDLBCompletionEngine.scala b/src/main/scala/io/smartdatalake/completion/SDLBCompletionEngine.scala new file mode 100644 index 0000000..18d42eb --- /dev/null +++ b/src/main/scala/io/smartdatalake/completion/SDLBCompletionEngine.scala @@ -0,0 +1,7 @@ +package 
io.smartdatalake.completion + +import io.smartdatalake.context.SDLBContext +import org.eclipse.lsp4j.CompletionItem + +trait SDLBCompletionEngine: + def generateCompletionItems(context: SDLBContext): List[CompletionItem] diff --git a/src/main/scala/io/smartdatalake/completion/SDLBCompletionEngineImpl.scala b/src/main/scala/io/smartdatalake/completion/SDLBCompletionEngineImpl.scala new file mode 100644 index 0000000..fab2b15 --- /dev/null +++ b/src/main/scala/io/smartdatalake/completion/SDLBCompletionEngineImpl.scala @@ -0,0 +1,37 @@ +package io.smartdatalake.completion + +import io.smartdatalake.completion.SDLBCompletionEngine +import io.smartdatalake.completion.schema.{ItemType, SchemaItem, SchemaReader, SchemaReaderImpl} +import io.smartdatalake.context.SDLBContext +import org.eclipse.lsp4j.{CompletionItem, CompletionItemKind} + +import scala.util.{Failure, Success, Try} + +class SDLBCompletionEngineImpl extends SDLBCompletionEngine { + + val schemaReader: SchemaReader = new SchemaReaderImpl("sdl-schema/sdl-schema-2.5.0.json") //TODO should be retrieved from a service keeping its state, object for example + + override def generateCompletionItems(context: SDLBContext): List[CompletionItem] = context.parentPath match + case path if path.startsWith("actions") && path.count(_ == '.') == 1 => generatePropertiesOfAction(context) + case path if path.startsWith("actions") && !path.contains('.') => List.empty[CompletionItem] //TODO discuss about this placeholder idea + case path if path.startsWith("actions") => List.empty[CompletionItem] //TODO when going deeper find a good recursive approach and mb merge it with first case + case _ => List.empty[CompletionItem] + + + private def generatePropertiesOfAction(context: SDLBContext): List[CompletionItem] = + val tActionType: Try[String] = Try(context.config.getString(context.parentPath + ".type")) + tActionType match + case Success(actionType) => schemaReader.retrieveActionProperties(actionType).map(createCompletionItem).toList + case Failure(_) => typeList + + private def createCompletionItem(item: SchemaItem): CompletionItem = + val completionItem = new CompletionItem() + completionItem.setLabel(item.name) + completionItem.setDetail(item.description) + completionItem.setInsertText(item.name + (if item.itemType.isComplexValue then " " else " = ")) + completionItem.setKind(CompletionItemKind.Snippet) + completionItem + + private val typeItem = createCompletionItem(SchemaItem("type", ItemType.STRING, " type of object")) + private val typeList = List(typeItem) +} diff --git a/src/main/scala/io/smartdatalake/completion/schema/ItemType.scala b/src/main/scala/io/smartdatalake/completion/schema/ItemType.scala new file mode 100644 index 0000000..42da6fb --- /dev/null +++ b/src/main/scala/io/smartdatalake/completion/schema/ItemType.scala @@ -0,0 +1,22 @@ +package io.smartdatalake.completion.schema + +enum ItemType(val name: String) { + case STRING extends ItemType("string") + case BOOLEAN extends ItemType("boolean") + case INTEGER extends ItemType("integer") + case OBJECT extends ItemType("object") + case ARRAY extends ItemType("array") + + def isPrimitiveValue: Boolean = this == ItemType.STRING || this == ItemType.BOOLEAN || this == ItemType.INTEGER + + def isComplexValue: Boolean = this == ItemType.OBJECT || this == ItemType.ARRAY + +} + +object ItemType: + def fromName(name: String): ItemType = name match + case "string" => ItemType.STRING + case "boolean" => ItemType.BOOLEAN + case "integer" => ItemType.INTEGER + case "object" => ItemType.OBJECT + case 
"array" => ItemType.ARRAY \ No newline at end of file diff --git a/src/main/scala/io/smartdatalake/completion/schema/SchemaItem.scala b/src/main/scala/io/smartdatalake/completion/schema/SchemaItem.scala new file mode 100644 index 0000000..063fe07 --- /dev/null +++ b/src/main/scala/io/smartdatalake/completion/schema/SchemaItem.scala @@ -0,0 +1,3 @@ +package io.smartdatalake.completion.schema + +case class SchemaItem(name: String, itemType: ItemType, description: String) //TODO title as well? diff --git a/src/main/scala/io/smartdatalake/completion/schema/SchemaReader.scala b/src/main/scala/io/smartdatalake/completion/schema/SchemaReader.scala new file mode 100644 index 0000000..eed5d6b --- /dev/null +++ b/src/main/scala/io/smartdatalake/completion/schema/SchemaReader.scala @@ -0,0 +1,4 @@ +package io.smartdatalake.completion.schema + +trait SchemaReader: + def retrieveActionProperties(typeName: String): Iterable[SchemaItem] diff --git a/src/main/scala/io/smartdatalake/completion/schema/SchemaReaderImpl.scala b/src/main/scala/io/smartdatalake/completion/schema/SchemaReaderImpl.scala new file mode 100644 index 0000000..b49489d --- /dev/null +++ b/src/main/scala/io/smartdatalake/completion/schema/SchemaReaderImpl.scala @@ -0,0 +1,22 @@ +package io.smartdatalake.completion.schema + +import scala.io.Source +import scala.util.Using + +class SchemaReaderImpl(val schemaPath: String) extends SchemaReader { + + private val schema = ujson.read(Using.resource(getClass.getClassLoader.getResourceAsStream(schemaPath)) { inputStream => + Source.fromInputStream(inputStream).getLines().mkString("\n").trim + }) + + override def retrieveActionProperties(typeName: String): Iterable[SchemaItem] = + val properties = schema("definitions")("Action")(typeName)("properties") + + properties.obj.map { case (keyName, value) => + val typeName = value.obj.get("type").map(_.str).getOrElse("string") + val description = value.obj.get("description").map(_.str).getOrElse("") + SchemaItem(keyName, ItemType.fromName(typeName), description) + } + + +} diff --git a/src/main/scala/io/smartdatalake/context/SDLBContext.scala b/src/main/scala/io/smartdatalake/context/SDLBContext.scala index def9844..8d35aad 100644 --- a/src/main/scala/io/smartdatalake/context/SDLBContext.scala +++ b/src/main/scala/io/smartdatalake/context/SDLBContext.scala @@ -19,7 +19,7 @@ class SDLBContext private (val text: String, val config: Config, val parentPath: * get context of the parent * @return either a SimpleConfigObject if parent is a key or a ConfigString, ConfigList, ConfigBoolean etc if it is an end value */ - def getParentContext: Option[ConfigValue] = if parentPath.isBlank then None else Some(config.getValue(parentPath)) + def getParentContext: Option[ConfigValue] = if parentPath.isBlank then None else Some(config.getValue(parentPath)) } diff --git a/src/main/scala/io/smartdatalake/context/hocon/HoconParser.scala b/src/main/scala/io/smartdatalake/context/hocon/HoconParser.scala index dc7de29..dfe0d24 100644 --- a/src/main/scala/io/smartdatalake/context/hocon/HoconParser.scala +++ b/src/main/scala/io/smartdatalake/context/hocon/HoconParser.scala @@ -20,9 +20,7 @@ private[context] object HoconParser: * @return parsed text in config format */ def parse(text: String): Option[Config] = - Try(ConfigFactory.parseString(text)) match - case Success(config) => Some(config) - case Failure(_) => None + Try(ConfigFactory.parseString(text)).toOption val EMPTY_CONFIG: Config = ConfigFactory.parseString("") diff --git 
a/src/main/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentService.scala b/src/main/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentService.scala index 394b6a4..4eade00 100644 --- a/src/main/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentService.scala +++ b/src/main/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentService.scala @@ -1,11 +1,15 @@ package io.smartdatalake.languageserver +import io.smartdatalake.completion.SDLBCompletionEngineImpl +import io.smartdatalake.context.SDLBContext import org.eclipse.lsp4j.jsonrpc.messages import org.eclipse.lsp4j.services.TextDocumentService import org.eclipse.lsp4j.{CodeAction, CodeActionParams, CodeLens, CodeLensParams, Command, CompletionItem, CompletionItemKind, CompletionList, CompletionParams, DefinitionParams, DidChangeTextDocumentParams, DidCloseTextDocumentParams, DidOpenTextDocumentParams, DidSaveTextDocumentParams, DocumentFormattingParams, DocumentHighlight, DocumentHighlightParams, DocumentOnTypeFormattingParams, DocumentRangeFormattingParams, DocumentSymbol, DocumentSymbolParams, Hover, HoverParams, InsertReplaceEdit, Location, LocationLink, Position, Range, ReferenceParams, RenameParams, SignatureHelp, SignatureHelpParams, SymbolInformation, TextDocumentPositionParams, TextEdit, WorkspaceEdit} import java.util import java.util.concurrent.CompletableFuture +import scala.io.Source +import scala.util.Using class SmartDataLakeTextDocumentService extends TextDocumentService { @@ -14,31 +18,45 @@ class SmartDataLakeTextDocumentService extends TextDocumentService { CompletableFuture.supplyAsync(() => { val completionItems = new util.ArrayList[CompletionItem]() - if (params.getPosition.getLine == 1) { - val completionItem = new CompletionItem() - completionItem.setInsertText("dataObjects {\n\t\n}\n\nactions {\n\t\n}") - completionItem.setLabel("gen") - completionItem.setKind(CompletionItemKind.Snippet) - completionItem.setDetail("Generate basic template") - - completionItems.add(completionItem) - } else { - // Sample Completion item for dataObject - val completionItem = new CompletionItem() - // Define the text to be inserted in to the file if the completion item is selected. - completionItem.setInsertText("dataObjects") - // Set the label that shows when the completion drop down appears in the Editor. - completionItem.setLabel("dataObjects") - // Set the completion kind. This is a snippet. - // That means it replace character which trigger the completion and - // replace it with what defined in inserted text. - completionItem.setKind(CompletionItemKind.Snippet) - // This will set the details for the snippet code which will help user to - // understand what this completion item is. - completionItem.setDetail(" {...}\n Defines the data objects") - // Add the sample completion item to the list. 
- completionItems.add(completionItem) - } + val fixtureText = //TODO weird behavior with \"\"\" + """actions { + | + | join-departures-airports { + | type = CustomDataFrameAction + | + | inputIds = [stg-departures, int-airports] + | transformer = { + | type = SQLDfsTransformer + | code = { + | btl-connected-airports = "select stg_departures.estdepartureairport, stg_departures.estarrivalairport, airports.* from stg_departures join int_airports airports on stg_departures.estArrivalAirport = airports.ident" + | } + | } + | } + | + | compute-distances { + | type = CopyAction + | + | code = { + | btl-departures-arrivals-airports = "select btl_connected_airports.estdepartureairport, btl_connected_airports.estarrivalairport, btl_connected_airports.name as arr_name, btl_connected_airports.latitude_deg as arr_latitude_deg, btl_connected_airports.longitude_deg as arr_longitude_deg, airports.name as dep_name, airports.latitude_deg as dep_latitude_deg, airports.longitude_deg as dep_longitude_deg from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident" + | } + | metadata { + | feed = compute + | } + | } + | + | download-airports { + | + | inputId = ext-airports + | } + | + |} + | + |dataObjects { + | + | + |}""".stripMargin.trim + val suggestions: List[CompletionItem] = new SDLBCompletionEngineImpl().generateCompletionItems(SDLBContext.createContext(fixtureText, params.getPosition.getLine+1, params.getPosition.getCharacter)) + suggestions.foreach(e => completionItems.add(e)) messages.Either.forLeft(completionItems).asInstanceOf[messages.Either[util.List[CompletionItem], CompletionList]] }) diff --git a/src/main/scala/io/smartdatalake/utils/MultiLineTransformer.scala b/src/main/scala/io/smartdatalake/utils/MultiLineTransformer.scala index 2e04917..65b36aa 100644 --- a/src/main/scala/io/smartdatalake/utils/MultiLineTransformer.scala +++ b/src/main/scala/io/smartdatalake/utils/MultiLineTransformer.scala @@ -15,11 +15,15 @@ object MultiLineTransformer { def computeCorrectedPositions(text: String): List[(Int, Int)] = + + def isMultilineModeStartingOrEnding(line: String): Boolean = + // handle specific case where the starting """ and ending """ are in the same line or not. 
+ line.count(_ == '"') % 2 == 1 case class State(isInMultiLine: Boolean, lineNumber: Int, columnShift: Int) text.split("\n") .foldLeft(List(State(false, 1, 0))) {(states, line) => val lastState = states.head - val isInTripleQuotes = lastState.isInMultiLine ^ line.contains("\"\"\"") + val isInTripleQuotes = lastState.isInMultiLine ^ isMultilineModeStartingOrEnding(line) if isInTripleQuotes then State(isInTripleQuotes, lastState.lineNumber, lastState.columnShift + line.length)::states else diff --git a/src/test/resources/fixture/airport-example.conf b/src/test/resources/fixture/hocon/airport-example.conf similarity index 100% rename from src/test/resources/fixture/airport-example.conf rename to src/test/resources/fixture/hocon/airport-example.conf diff --git a/src/test/resources/fixture/basic-example.conf b/src/test/resources/fixture/hocon/basic-example.conf similarity index 100% rename from src/test/resources/fixture/basic-example.conf rename to src/test/resources/fixture/hocon/basic-example.conf diff --git a/src/test/resources/fixture/with-comments-example.conf b/src/test/resources/fixture/hocon/with-comments-example.conf similarity index 100% rename from src/test/resources/fixture/with-comments-example.conf rename to src/test/resources/fixture/hocon/with-comments-example.conf diff --git a/src/test/resources/fixture/with-lists-example.conf b/src/test/resources/fixture/hocon/with-lists-example.conf similarity index 100% rename from src/test/resources/fixture/with-lists-example.conf rename to src/test/resources/fixture/hocon/with-lists-example.conf diff --git a/src/test/resources/fixture/with-multi-lines-example.conf b/src/test/resources/fixture/hocon/with-multi-lines-example.conf similarity index 100% rename from src/test/resources/fixture/with-multi-lines-example.conf rename to src/test/resources/fixture/hocon/with-multi-lines-example.conf diff --git a/src/test/resources/fixture/with-multi-lines-flattened-example.conf b/src/test/resources/fixture/hocon/with-multi-lines-flattened-example.conf similarity index 100% rename from src/test/resources/fixture/with-multi-lines-flattened-example.conf rename to src/test/resources/fixture/hocon/with-multi-lines-flattened-example.conf diff --git a/src/test/resources/fixture/sdl-schema/sdl-schema-2.5.0.json b/src/test/resources/fixture/sdl-schema/sdl-schema-2.5.0.json new file mode 100644 index 0000000..ebcf95c --- /dev/null +++ b/src/test/resources/fixture/sdl-schema/sdl-schema-2.5.0.json @@ -0,0 +1,10111 @@ +{ + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "version": "2.5.0", + "id": "sdl-schema-2.5.0.json#", + "definitions": { + "ExecutionMode": { + "CustomMode": { + "type": "object", + "properties": { + "type": { + "const": "CustomMode" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomModeLogic]]" + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing over multiple actions in case of errors." 
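Stepping back from the schema fixture for a moment, the new Scala pieces above compose roughly as follows. This is a sketch for orientation, not part of the patch; the caret position passed to createContext is illustrative, and which parentPath it produces depends on the HOCON layout.

  import io.smartdatalake.completion.SDLBCompletionEngineImpl
  import io.smartdatalake.completion.schema.SchemaReaderImpl
  import io.smartdatalake.context.SDLBContext

  // SchemaReaderImpl loads the bundled JSON schema from the classpath and exposes the
  // properties of an action type as SchemaItems (name, type, description).
  val schemaReader = new SchemaReaderImpl("sdl-schema/sdl-schema-2.5.0.json")
  schemaReader.retrieveActionProperties("CopyAction")
    .foreach(item => println(s"${item.name} (${item.itemType}): ${item.description}"))

  // SDLBCompletionEngineImpl reacts to a parentPath one level below "actions"
  // ("actions.<actionName>"): it reads that action's "type" from the parsed HOCON and
  // maps the matching schema properties to LSP CompletionItems.
  val hocon =
    """actions {
      |  compute-distances {
      |    type = CopyAction
      |  }
      |}""".stripMargin
  val context = SDLBContext.createContext(hocon, 3, 4) // caret position is illustrative
  val completions = new SDLBCompletionEngineImpl().generateCompletionItems(context)
  completions.foreach(c => println(c.getLabel))

The quote-parity check introduced in MultiLineTransformer further up rests on a small invariant: a HOCON line that opens or closes a triple-quoted value, but not both, contains an odd number of double quotes, whereas a line that opens and closes one on the spot, or only contains ordinary quoted strings, contains an even number. Restated standalone with a few illustrative lines:

  // Same parity heuristic as isMultilineModeStartingOrEnding in the patch.
  def togglesMultilineMode(line: String): Boolean = line.count(_ == '"') % 2 == 1

  assert(togglesMultilineMode("code = \"\"\"select *"))        // 3 quotes: opens a multi-line value
  assert(!togglesMultilineMode("key = \"value\""))             // 2 quotes: plain quoted string
  assert(!togglesMultilineMode("code = \"\"\"select 1\"\"\"")) // 6 quotes: opens and closes on one line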
+ }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options specified in the configuration for this execution mode" + } + }, + "title": "CustomMode", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Execution mode to create custom execution mode logic.\nDefine a function which receives main input&output DataObject and returns execution mode result" + }, + "CustomPartitionMode": { + "type": "object", + "properties": { + "type": { + "const": "CustomPartitionMode" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomPartitionModeLogic]]" + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing all partitions over multiple actions in case of errors." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options specified in the configuration for this execution mode" + } + }, + "title": "CustomPartitionMode", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Execution mode to create custom partition execution mode logic.\n\nDefine a function which receives main input&output DataObject and returns partition values to process as`Seq[Map[String,String]]`" + }, + "DataFrameIncrementalMode": { + "type": "object", + "properties": { + "type": { + "const": "DataFrameIncrementalMode" + }, + "compareCol": { + "type": "string", + "description": "a comparable column name existing in mainInput and mainOutput used to identify the delta. Column content should be bigger for newer records." + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing all partitions over multiple actions in case of errors." + }, + "applyCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + } + }, + "title": "DataFrameIncrementalMode", + "required": [ + "type", + "compareCol" + ], + "additionalProperties": false, + "description": "Compares max entry in \\\"compare column\\\" between mainOutput and mainInput and incrementally loads the delta.\nThis mode works only with SparkSubFeeds. The filter is not propagated to following actions." + }, + "DataObjectStateIncrementalMode": { + "type": "object", + "properties": { + "type": { + "const": "DataObjectStateIncrementalMode" + } + }, + "title": "DataObjectStateIncrementalMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "An execution mode for incremental processing by remembering DataObjects state from last increment." 
+ }, + "FailIfNoPartitionValuesMode": { + "type": "object", + "properties": { + "type": { + "const": "FailIfNoPartitionValuesMode" + } + }, + "title": "FailIfNoPartitionValuesMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "An execution mode which just validates that partition values are given.\nNote: For start nodes of the DAG partition values can be defined by command line, for subsequent nodes partition values are passed on from previous nodes." + }, + "FileIncrementalMoveMode": { + "type": "object", + "properties": { + "type": { + "const": "FileIncrementalMoveMode" + }, + "archivePath": { + "type": "string", + "description": "if an archive directory is configured, files are moved into that directory instead of deleted, preserving partition layout.\nIf this is a relative path, e.g. \\\"_archive\\\", it is appended after the path of the DataObject.\nIf this is an absolute path it replaces the path of the DataObject." + } + }, + "title": "FileIncrementalMoveMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Execution mode to incrementally process file-based DataObjects, e.g. FileRefDataObjects and SparkFileDataObjects.\nFor FileRefDataObjects:\n- All existing files in the input DataObject are processed and removed (deleted or archived) after processing\n- Input partition values are applied to search for files and also used as output partition values\nFor SparkFileDataObjects:\n- Files processed are read from the DataFrames execution plan and removed (deleted or archived) after processing.\nNote that is only correct if no additional filters are applied in the DataFrame.\nA better implementation would be to observe files by a custom metric. Unfortunately there is a problem in Spark with that, see also[[CollectSetDeterministic]] \n- Partition values preserved." + }, + "KafkaStateIncrementalMode": { + "type": "object", + "properties": { + "type": { + "const": "KafkaStateIncrementalMode" + }, + "delayedMaxTimestampExpr": { + "type": "string", + "description": "Optional expression to define a delay to read latest offsets from Kafka. The expression has to return a timestamp which is used to select ending offsets to read from Kafka.\nDefine a spark sql expression working with the attributes of[[DefaultExpressionData]] returning a timestamp.\nDefault is to read latest offsets existing in Kafka." + } + }, + "title": "KafkaStateIncrementalMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "A special incremental execution mode for Kafka Inputs, remembering the state from the last increment through the Kafka Consumer, e.g. committed offsets." + }, + "PartitionDiffMode": { + "type": "object", + "properties": { + "type": { + "const": "PartitionDiffMode" + }, + "partitionColNb": { + "type": "integer", + "description": "optional number of partition columns to use as a common \\'init\\'." + }, + "alternativeOutputId": { + "type": "string", + "description": "optional alternative outputId of DataObject later in the DAG. This replaces the mainOutputId.\nIt can be used to ensure processing all partitions over multiple actions in case of errors." + }, + "nbOfPartitionValuesPerRun": { + "type": "integer", + "description": "optional restriction of the number of partition values per run." + }, + "applyCondition": { + "type": "string", + "description": "Condition to decide if execution mode should be applied or not. 
Define a spark sql expression working with attributes of[[DefaultExecutionModeExpressionData]] returning a boolean.\nDefault is to apply the execution mode if given partition values (partition values from command line or passed from previous action) are empty." + }, + "failCondition": { + "type": "string" + }, + "failConditions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "description": "List of conditions to fail application of execution mode if true. Define as spark sql expressions working with attributes of[[PartitionDiffModeExpressionData]] returning a boolean.\nDefault is that the application of the PartitionDiffMode does not fail the action. If there is no data to process, the following actions are skipped.\nMultiple conditions are evaluated individually and every condition may fail the execution mode (or-logic)" + }, + "selectExpression": { + "type": "string", + "description": "optional expression to define or refine the list of selected output partitions. Define a spark sql expression working with the attributes of[[PartitionDiffModeExpressionData]] returning a list>.\nDefault is to return the originally selected output partitions found in attribute selectedOutputPartitionValues." + }, + "applyPartitionValuesTransform": { + "type": "boolean", + "description": "If true applies the partition values transform of custom transformations on input partition values before comparison with output partition values.\nIf enabled input and output partition columns can be different. Default is to disable the transformation of partition values." + }, + "selectAdditionalInputExpression": { + "type": "string", + "description": "optional expression to refine the list of selected input partitions. Note that primarily output partitions are selected by PartitionDiffMode.\nThe selected output partitions are then transformed back to the input partitions needed to create the selected output partitions. This is one-to-one except if applyPartitionValuesTransform=true.\nAnd sometimes there is a need for additional input data to create the output partitions, e.g. if you aggregate a window of 7 days for every day.\nYou can customize selected input partitions by defining a spark sql expression working with the attributes of[[PartitionDiffModeExpressionData]] returning a list>.\nDefault is to return the originally selected input partitions found in attribute selectedInputPartitionValues." + } + }, + "title": "PartitionDiffMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Partition difference execution mode lists partitions on mainInput & mainOutput DataObject and starts loading all missing partitions.\nPartition columns to be used for comparision need to be a common \\'init\\' of input and output partition columns.\nThis mode needs mainInput/Output DataObjects which CanHandlePartitions to list partitions.\nPartition values are passed to following actions for partition columns which they have in common." 
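The PartitionDiffMode definition closing above requires nothing beyond "type"; in practice one typically also caps the number of partition values per run. A hedged sketch using only keys listed above; attaching the block under an action's executionMode key follows the usual SDLB convention and is assumed here rather than shown in this excerpt, and all ids and values are illustrative.

  import com.typesafe.config.ConfigFactory

  // Minimal PartitionDiffMode block attached to an action.
  val actionWithExecutionMode = ConfigFactory.parseString(
    """actions {
      |  copy-partitioned {
      |    type = CopyAction
      |    inputId = src
      |    outputId = dst
      |    executionMode = {
      |      type = PartitionDiffMode
      |      nbOfPartitionValuesPerRun = 1
      |    }
      |  }
      |}""".stripMargin)

  assert(actionWithExecutionMode.getInt("actions.copy-partitioned.executionMode.nbOfPartitionValuesPerRun") == 1)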
+ }, + "ProcessAllMode": { + "type": "object", + "properties": { + "type": { + "const": "ProcessAllMode" + } + }, + "title": "ProcessAllMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "An execution mode which forces processing all data from it\\'s inputs." + }, + "SparkStreamingMode": { + "type": "object", + "properties": { + "type": { + "const": "SparkStreamingMode" + }, + "checkpointLocation": { + "type": "string", + "description": "location for checkpoints of streaming query to keep state" + }, + "triggerType": { + "type": "string", + "description": "define execution interval of Spark streaming query. Possible values are Once (default), ProcessingTime & Continuous. See[[Trigger]] for details.\nNote that this is only applied if SDL is executed in streaming mode. If SDL is executed in normal mode, TriggerType=Once is used always.\nIf triggerType=Once, the action is repeated with Trigger.Once in SDL streaming mode." + }, + "triggerTime": { + "type": "string", + "description": "Time as String in triggerType = ProcessingTime or Continuous. See[[Trigger]] for details." + }, + "inputOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "additional option to apply when reading streaming source. This overwrites options set by the DataObjects." + }, + "outputOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "additional option to apply when writing to streaming sink. This overwrites options set by the DataObjects." + }, + "outputMode": { + "type": "string", + "enum": [ + "Append", + "Complete", + "Update" + ] + } + }, + "title": "SparkStreamingMode", + "required": [ + "type", + "checkpointLocation" + ], + "additionalProperties": false, + "description": "Spark streaming execution mode uses Spark Structured Streaming to incrementally execute data loads and keep track of processed data.\nThis mode needs a DataObject implementing CanCreateStreamingDataFrame and works only with SparkSubFeeds.\nThis mode can be executed synchronously in the DAG by using triggerType=Once, or asynchronously as Streaming Query with triggerType = ProcessingTime or Continuous." + } + }, + "ValidationRule": { + "RowLevelValidationRule": { + "type": "object", + "properties": { + "type": { + "const": "RowLevelValidationRule" + }, + "condition": { + "type": "string", + "description": "an SQL expression defining the condition to be tested. The condition should return true if the condition is satisfied." + }, + "errorMsg": { + "type": "string", + "description": "Optional error msg to be create if the condition fails. Default is to use a text representation of the condition." + } + }, + "title": "RowLevelValidationRule", + "required": [ + "type", + "condition" + ], + "additionalProperties": false, + "description": "Definition for a row level data validation rule." + } + }, + "Connection": { + "DeltaLakeTableConnection": { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableConnection" + }, + "catalog": { + "type": "string", + "description": "optional catalog to be used for this connection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for tables directory on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "checkDeltaLakeSparkOptions": { + "type": "boolean" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "DeltaLakeTableConnection", + "required": [ + "type", + "db", + "pathPrefix" + ], + "additionalProperties": false, + "description": "Connection information for DeltaLake tables" + }, + "HadoopFileConnection": { + "type": "object", + "properties": { + "type": { + "const": "HadoopFileConnection" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for accessing files on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HadoopFileConnection", + "required": [ + "type", + "pathPrefix" + ], + "additionalProperties": false, + "description": "Connection information for files on hadoop" + }, + "HiveTableConnection": { + "type": "object", + "properties": { + "type": { + "const": "HiveTableConnection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "optional schema, authority and base path for tables directory on hadoop." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HiveTableConnection", + "required": [ + "type", + "db" + ], + "additionalProperties": false, + "description": "Connection information for hive tables" + }, + "JdbcTableConnection": { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableConnection" + }, + "url": { + "type": "string", + "description": "jdbc connection url" + }, + "driver": { + "type": "string", + "description": "class name of jdbc driver" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "db": { + "type": "string", + "description": "jdbc database" + }, + "maxParallelConnections": { + "type": "integer", + "description": "max number of parallel jdbc connections created by an instance of this connection, default is 3\nNote that Spark manages JDBC Connections on its own. This setting only applies to JDBC connection\nused by SDL for validating metadata or pre/postSQL." + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "connectionPoolMaxWaitTimeSec": { + "type": "integer", + "description": "timeout when waiting for connection in pool to become available. Default is 600 seconds (10 minutes)." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + }, + "autoCommit": { + "type": "boolean", + "description": "flag to enable or disable the auto-commit behaviour. When autoCommit is enabled, each database request is executed in its own transaction.\nDefault is autoCommit = false. 
It is not recommended to enable autoCommit as it will deactivate any transactional behaviour.", + "deprecated": true + }, + "connectionInitSql": { + "type": "string", + "description": "SQL statement to be executed every time a new connection is created, for example to set session parameters" + } + }, + "title": "JdbcTableConnection", + "required": [ + "type", + "url", + "driver" + ], + "additionalProperties": false, + "description": "Connection information for jdbc tables.\nIf authentication is needed, user and password must be provided." + }, + "KafkaConnection": { + "type": "object", + "properties": { + "type": { + "const": "KafkaConnection" + }, + "brokers": { + "type": "string", + "description": "comma separated list of kafka bootstrap server incl. port, e.g. \\\"host1:9092,host2:9092:" + }, + "schemaRegistry": { + "type": "string", + "description": "url of schema registry service, e.g. \\\"https://host2\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html)" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "KafkaConnection", + "required": [ + "type", + "brokers" + ], + "additionalProperties": false, + "description": "Connection information for kafka" + }, + "SFtpFileRefConnection": { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefConnection" + }, + "host": { + "type": "string", + "description": "sftp host" + }, + "port": { + "type": "integer", + "description": "port of sftp service, default is 22" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode and PublicKeyAuthMode are supported." + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "proxy host" + }, + "port": { + "type": "integer", + "description": "proxy port" + }, + "proxyType": { + "type": "string", + "description": "Type of proxy: HTTP or SOCKS. Default is HTTP.", + "enum": [ + "DIRECT", + "HTTP", + "SOCKS" + ] + } + }, + "title": "JavaNetProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false, + "description": "Proxy configuration to create java.net.Proxy instance." 
+ }, + "ignoreHostKeyVerification": { + "type": "boolean", + "description": "do not validate host key if true, default is false" + }, + "maxParallelConnections": { + "type": "integer", + "description": "number of parallel sftp connections created by an instance of this connection" + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SFtpFileRefConnection", + "required": [ + "type", + "host", + "authMode" + ], + "additionalProperties": false, + "description": "SFTP Connection information" + }, + "SnowflakeConnection": { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeConnection" + }, + "url": { + "type": "string", + "description": "snowflake connection url" + }, + "warehouse": { + "type": "string", + "description": "Snowflake namespace" + }, + "database": { + "type": "string", + "description": "Snowflake database" + }, + "role": { + "type": "string", + "description": "Snowflake role" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SnowflakeConnection", + "required": [ + "type", + "url", + "warehouse", + "database", + "role", + "authMode" + ], + "additionalProperties": false, + "description": "Connection information for Snowflake databases.\nThe connection can be used for SnowflakeTableDataObjects\nIf multiple SnowflakeTableDataObjects share a connection, they share the same Snowpark session" + }, + "SplunkConnection": { + "type": "object", + "properties": { + "type": { + "const": "SplunkConnection" + }, + "host": { + "type": "string", + "description": "" + }, + "port": { + "type": "integer", + "description": "" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SplunkConnection", + "required": [ + "type", + "host", + "port", + "authMode" + ], + "additionalProperties": false, + "description": "Connection information for splunk queries" + } + }, + "HousekeepingMode": { + "PartitionArchiveCompactionMode": { + "type": "object", + "properties": { + "type": { + "const": "PartitionArchiveCompactionMode" + }, + "archivePartitionExpression": { + "type": "string", + "description": "Expression to define the archive partition for a given partition. 
Define a spark\nsql expression working with the attributes of[[PartitionExpressionData]] returning archive\npartition values as Map[String,String]. If return value is the same as input elements, partition is not touched,\notherwise all files of the partition are moved to the returned partition definition.\nBe aware that the value of the partition columns changes for these files/records." + }, + "compactPartitionExpression": { + "type": "string", + "description": "Expression to define partitions which should be compacted. Define a spark\nsql expression working with the attributes of[[PartitionExpressionData]] returning a\nboolean = true when this partition should be compacted.\nOnce a partition is compacted, it is marked as compacted and will not be compacted again.\nIt is therefore ok to return true for all partitions which should be compacted, regardless if they have been compacted already." + }, + "description": { + "type": "string" + } + }, + "title": "PartitionArchiveCompactionMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Archive and compact old partitions:\nArchive partition reduces the number of partitions in the past by moving older partitions into special \\\"archive partitions\\\".\nCompact partition reduces the number of files in a partition by rewriting them with Spark.\nExample: archive and compact a table with partition layout run_id=\n- archive partitions after 1000 partitions into \\\"archive partition\\\" equal to floor(run_id/1000)\n- compact \\\"archive partition\\\" when full\n\n```\nhousekeepingMode = {\ntype = PartitionArchiveCompactionMode\narchivePartitionExpression = \\\"if( elements[\\'run_id\\'] < runId - 1000, map(\\'run_id\\', elements[\\'run_id\\'] div 1000), elements)\\\"\ncompactPartitionExpression = \\\"elements[\\'run_id\\'] % 1000 = 0 and elements[\\'run_id\\'] <= runId - 2000\\\"\n}\n```" + }, + "PartitionRetentionMode": { + "type": "object", + "properties": { + "type": { + "const": "PartitionRetentionMode" + }, + "retentionCondition": { + "type": "string", + "description": "Condition to decide if a partition should be kept. Define a spark sql expression\nworking with the attributes of[[PartitionExpressionData]] returning a boolean with value true if the partition should be kept." + }, + "description": { + "type": "string" + } + }, + "title": "PartitionRetentionMode", + "required": [ + "type", + "retentionCondition" + ], + "additionalProperties": false, + "description": "Keep partitions while retention condition is fulfilled, delete other partitions.\nExample: cleanup partitions with partition layout dt= after 90 days:\n\n```\nhousekeepingMode = {\ntype = PartitionRetentionMode\nretentionCondition = \\\"datediff(now(), to_date(elements[\\'dt\\'], \\'yyyyMMdd\\')) <= 90\\\"\n}\n```" + } + }, + "DataObject": { + "AccessTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "AccessTableDataObject" + }, + "path": { + "type": "string" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "AccessTableDataObject", + "required": [ + "type", + "path", + "table" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type JDBC / Access.\nProvides access to a Access DB to an Action. The functionality is handled seperately from[[JdbcTableDataObject]] \nto avoid problems with net.ucanaccess.jdbc.UcanaccessDriver" + }, + "ActionsExporterDataObject": { + "type": "object", + "properties": { + "type": { + "const": "ActionsExporterDataObject" + }, + "config": { + "type": "string" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "ActionsExporterDataObject", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Exports a util[[DataFrame]]that contains properties and metadata extracted from all[[io.smartdatalake.workflow.action.Action]]s\nthat are registered in the current[[InstanceRegistry]].\n\nAlternatively, it can export the properties and metadata of all[[io.smartdatalake.workflow.action.Action]]s defined in config files. For this, the\nconfiguration \\\"config\\\" has to be set to the location of the config.\n\nExample:\n\n```\ndataObjects = {\n...\nactions-exporter {\ntype = ActionsExporterDataObject\nconfig = path/to/myconfiguration.conf\n}\n...\n}\n```\n\n\nThe config value can point to a configuration file or a directory containing configuration files.\n\nSEE: Refer to[[ConfigLoader.loadConfigFromFilesystem()]] for details about the configuration loading." + }, + "AirbyteDataObject": { + "type": "object", + "properties": { + "type": { + "const": "AirbyteDataObject" + }, + "config": { + "type": "string", + "description": "Configuration for the source", + "existingJavaType": "com.typesafe.config.Config" + }, + "streamName": { + "type": "string", + "description": "The stream name to read. Must match an entry of the catalog of the source." + }, + "cmd": { + "oneOf": [ + { + "$ref": "#/definitions/ParsableScriptDef/CmdScript" + }, + { + "$ref": "#/definitions/ParsableScriptDef/DockerRunScript" + } + ], + "description": "command to launch airbyte connector. Normally this is of type[[DockerRunScript]] ." + }, + "incrementalCursorFields": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Some sources need a specification of the cursor field for incremental mode" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "AirbyteDataObject", + "required": [ + "type", + "config", + "streamName", + "cmd" + ], + "additionalProperties": false, + "description": "Limitations: Connectors have only access to locally mounted directories" + }, + "AvroFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "AvroFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "avroOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and\n[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "schema": { + "type": "string", + "description": "An optional schema for the spark data frame to be validated on read and write. Note: Existing Avro files\ncontain a source schema. Therefore, this schema is ignored when reading from existing Avro files.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, avroSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." 
+ }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "AvroFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[io.smartdatalake.workflow.dataobject.DataObject]]backed by an Avro data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on Avro formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively. The reader and writer implementations are provided by\nthe[[https://github.com/databricks/spark-avro databricks spark-avro]] project.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "CsvFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "CsvFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "csvOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] ." 
+ }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "dateColumnType": { + "type": "string", + "description": "Specifies the string format used for writing date typed data.", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." 
+ } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "CsvFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]]backed by a comma-separated value (CSV) data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on CSV formatted files.\n\nCSV reading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively.\n\nRead Schema specifications:\n\nIf a data object schema is not defined via the`schema`attribute (default) and`inferSchema`option is\ndisabled (default) in`csvOptions`, then all column types are set to String and the first row of the CSV file is read\nto determine the column names and the number of fields.\n\nIf the`header`option is disabled (default) in`csvOptions`, then the header is defined as \\\"_c#\\\" for each column\nwhere \\\"#\\\" is the column index.\nOtherwise the first row of the CSV file is not included in the DataFrame content and its entries\nare used as the column names for the schema.\n\nIf a data object schema is not defined via the`schema`attribute and`inferSchema`is enabled in`csvOptions`, then\nthe`samplingRatio`(default: 1.0) option in`csvOptions` is used to extract a sample from the CSV file in order to\ndetermine the input schema automatically.\n\nNOTE: This data object sets the following default values for`csvOptions`: delimiter = \\\"|\\\", quote = null, header = false, and inferSchema = false.\nAll other`csvOption` default to the values defined by Apache Spark.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "CustomDfDataObject": { + "type": "object", + "properties": { + "type": { + "const": "CustomDfDataObject" + }, + "creator": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfCreator]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for creator is loaded from. The scala code in the file needs to be a function of type[[fnExecType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for creator. The scala code needs to be a function of type[[fnExecType]] ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the creator" + } + }, + "title": "CustomDfCreatorConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame creator as part of[[CustomDfDataObject]]\nDefine a exec function which receives a map of options and returns a DataFrame to be used as input.\nOptionally define a schema function to return a StructType used as schema in init-phase.\nSee also trait[[CustomDfCreator]] .\n\nNote that for now implementing CustomDfCreator.schema method is only possible with className configuration attribute." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "CustomDfDataObject", + "required": [ + "type", + "creator" + ], + "additionalProperties": false, + "description": "Generic[[DataObject]] containing a config object.\nE.g. used to implement a CustomAction that reads a Webservice." + }, + "CustomFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "CustomFileDataObject" + }, + "creator": { + "type": "object", + "properties": { + "className": { + "type": "string" + }, + "scalaFile": { + "type": "string" + }, + "scalaCode": { + "type": "string" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + }, + "title": "CustomFileCreatorConfig", + "additionalProperties": false + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "CustomFileDataObject", + "required": [ + "type", + "creator" + ], + "additionalProperties": false + }, + "DataObjectsExporterDataObject": { + "type": "object", + "properties": { + "type": { + "const": "DataObjectsExporterDataObject" + }, + "config": { + "type": "string" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "DataObjectsExporterDataObject", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Exports a util[[DataFrame]]that contains properties and metadata extracted from all[[DataObject]]s\nthat are registered in the current[[InstanceRegistry]].\n\nAlternatively, it can export the properties and metadata of all[[DataObject]]s defined in config files. For this, the\nconfiguration \\\"config\\\" has to be set to the location of the config.\n\nExample:\n\n```\ndataObjects = {\n...\ndataobject-exporter {\ntype = DataObjectsExporterDataObject\nconfig = path/to/myconfiguration.conf\n}\n...\n}\n```\n\n\nThe config value can point to a configuration file or a directory containing configuration files.\n\nSEE: Refer to[[ConfigLoader.loadConfigFromFilesystem()]] for details about the configuration loading." + }, + "DeltaLakeTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableDataObject" + }, + "path": { + "type": "string", + "description": "hadoop directory for this table. If it doesn\\'t contain scheme and authority, the connections pathPrefix is applied.\nIf pathPrefix is not defined or doesn\\'t define scheme and authority, default schema and authority is applied." 
+ }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "partition columns for this data object" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for Delta Lake tables see:[[https://docs.delta.io/latest/delta-batch.html]]and[[org.apache.spark.sql.delta.DeltaOptions]]" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that this DataObject must have to pass schema validation on reading and writing.\nDefine schema by using a DDL-formatted string, which is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. 
All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." + }, + "saveMode": { + "type": "string", + "description": "[[SDLSaveMode]] to use when writing files, default is \\\"overwrite\\\". Overwrite, Append and Merge are supported for now.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "allowSchemaEvolution": { + "type": "boolean", + "description": "If set to true schema evolution will automatically occur when writing to this DataObject with different schema, otherwise SDL will stop with error." + }, + "retentionPeriod": { + "type": "integer", + "description": "Optional delta lake retention threshold in hours. Files required by the table for reading versions younger than retentionPeriod will be preserved and the rest of them will be deleted." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "optional id of[[io.smartdatalake.workflow.connection.HiveTableConnection]]" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." 
+ }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "DeltaLakeTableDataObject", + "required": [ + "type", + "table" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type DeltaLakeTableDataObject.\nProvides details to access Tables in delta format to an Action.\n\nDelta format maintains a transaction log in a separate _delta_log subfolder.\nThe schema is registered in Metastore by DeltaLakeTableDataObject.\n\nThe following anomalies might occur:\n- table is registered in metastore but path does not exist -> table is dropped from metastore\n- table is registered in metastore but path is empty -> error is thrown. Delete the path to clean up\n- table is registered and path contains parquet files, but _delta_log subfolder is missing -> path is converted to delta format\n- table is not registered but path contains parquet files and _delta_log subfolder -> Table is registered\n- table is not registered but path contains parquet files without _delta_log subfolder -> path is converted to delta format and table is registered\n- table is not registered and path does not exists -> table is created on write\n\n* DeltaLakeTableDataObject implements\n-[[CanMergeDataFrame]]by using DeltaTable.merge API.\n-[[CanEvolveSchema]] by using mergeSchema option.\n- Overwriting partitions is implemented by replaceWhere option in one transaction." + }, + "ExcelFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "ExcelFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "excelOptions": { + "type": "object", + "properties": { + "sheetName": { + "type": "string", + "description": "Optional name of the Excel Sheet to read from/write to." + }, + "numLinesToSkip": { + "type": "integer", + "description": "Optional number of rows in the excel spreadsheet to skip before any data is read.\nThis option must not be set for writing." + }, + "startColumn": { + "type": "string", + "description": "Optional first column in the specified Excel Sheet to read from (as string, e.g B).\nThis option must not be set for writing." + }, + "endColumn": { + "type": "string", + "description": "Optional last column in the specified Excel Sheet to read from (as string, e.g. F)." + }, + "rowLimit": { + "type": "integer", + "description": "Optional limit of the number of rows being returned on read.\nThis is applied after`numLinesToSkip` ." + }, + "useHeader": { + "type": "boolean", + "description": "If`true` , the first row of the excel sheet specifies the column names (default: true)." + }, + "treatEmptyValuesAsNulls": { + "type": "boolean", + "description": "Empty cells are parsed as`null` values (default: true)." + }, + "inferSchema": { + "type": "boolean", + "description": "Infer the schema of the excel sheet automatically (default: true)." 
+ }, + "timestampFormat": { + "type": "string", + "description": "A format string specifying the format to use when writing timestamps (default: dd-MM-yyyy HH:mm:ss)." + }, + "dateFormat": { + "type": "string", + "description": "A format string specifying the format to use when writing dates." + }, + "maxRowsInMemory": { + "type": "integer", + "description": "The number of rows that are stored in memory.\nIf set, a streaming reader is used which can help with big files." + }, + "excerptSize": { + "type": "integer", + "description": "Sample size for schema inference." + } + }, + "title": "ExcelOptions", + "additionalProperties": false, + "description": "Options passed to[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] for\nreading and writing Microsoft Excel files. Excel support is provided by the spark-excel project (see link below).\n\nSEE: [[https://github.com/crealytics/spark-excel]]" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." 
+ }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. 
it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "ExcelFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]]backed by an Microsoft Excel data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on Microsoft Excel (.xslx) formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively. The reader and writer implementation is provided by the\n[[https://github.com/crealytics/spark-excel Crealytics spark-excel]]project.\n\nRead Schema:\n\nWhen`useHeader`is set to true (default), the reader will use the first row of the Excel sheet as column names for\nthe schema and not include the first row as data values. Otherwise the column names are taken from the schema.\nIf the schema is not provided or inferred, then each column name is defined as \\\"_c#\\\" where \\\"#\\\" is the column index.\n\nWhen a data object schema is provided, it is used as the schema for the DataFrame. Otherwise if`inferSchema`is\nenabled (default), then the data types of the columns are inferred based on the first`excerptSize`rows\n(excluding the first).\nWhen no schema is provided and`inferSchema` is disabled, all columns are assumed to be of string type." + }, + "HiveTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "HiveTableDataObject" + }, + "path": { + "type": "string", + "description": "hadoop directory for this table. If it doesn\\'t contain scheme and authority, the connections pathPrefix is applied.\nIf pathPrefix is not defined or doesn\\'t define scheme and authority, default schema and authority is applied.\nIf DataObject is only used for reading or if the HiveTable already exist, the path can be omitted.\nIf the HiveTable already exists but with a different path, a warning is issued" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "partition columns for this data object" + }, + "analyzeTableAfterWrite": { + "type": "boolean", + "description": "enable compute statistics after writing data (default=false)" + }, + "dateColumnType": { + "type": "string", + "description": "type of date column", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that this DataObject must have to pass schema validation on reading and writing.\nDefine schema by using a DDL-formatted string, which is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." 
+ }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." 
+ }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." 
+ }, + "numInitialHdfsPartitions": { + "type": "integer", + "description": "number of files created when writing into an empty table (otherwise the number will be derived from the existing data)" + }, + "saveMode": { + "type": "string", + "description": "spark[[SaveMode]] to use when writing files, default is \\\"overwrite\\\"", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "optional id of[[io.smartdatalake.workflow.connection.HiveTableConnection]]" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "HiveTableDataObject", + "required": [ + "type", + "table" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type Hive.\nProvides details to access Hive tables to an Action" + }, + "JdbcTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableDataObject" + }, + "createSql": { + "type": "string", + "description": "DDL-statement to be executed in prepare phase, using output jdbc connection.\nNote that it is also possible to let Spark create the table in Init-phase. See jdbcOptions to customize column data types for auto-created DDL-statement." + }, + "preReadSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase before reading input table, using input jdbc connection.\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." 
+ }, + "postReadSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase after reading input table and before action is finished, using input jdbc connection\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." + }, + "preWriteSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase before writing output table, using output jdbc connection\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." + }, + "postWriteSql": { + "type": "string", + "description": "SQL-statement to be executed in exec phase after writing output table, using output jdbc connection\nUse tokens with syntax %{} to substitute with values from[[DefaultExpressionData]] ." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that this DataObject must have to pass schema validation on reading and writing.\nDefine schema by using a DDL-formatted string, which is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." 
+ }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." + }, + "jdbcFetchSize": { + "type": "integer", + "description": "Number of rows to be fetched together by the Jdbc driver" + }, + "saveMode": { + "type": "string", + "description": "[[SDLSaveMode]] to use when writing table, default is \\\"Overwrite\\\". Only \\\"Append\\\" and \\\"Overwrite\\\" supported.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "allowSchemaEvolution": { + "type": "boolean", + "description": "If set to true schema evolution will automatically occur when writing to this DataObject with different schema, otherwise SDL will stop with error." + }, + "connectionId": { + "type": "string", + "description": "Id of JdbcConnection configuration" + }, + "jdbcOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Any jdbc options according to[[https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html]] .\nNote that some options above set and override some of this options explicitly.\nUse \\\"createTableOptions\\\" and \\\"createTableColumnTypes\\\" to control automatic creating of database tables." + }, + "virtualPartitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Virtual partition columns. Note that this doesn\\'t need to be the same as the database partition\ncolumns for this table. But it is important that there is an index on these columns to efficiently\nlist existing \\\"partitions\\\"." + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "incrementalOutputExpr": { + "type": "string", + "description": "Optional expression to use for creating incremental output with DataObjectStateIncrementalMode.\nThe expression is used to get the high-water-mark for the incremental update state.\nNormally this can be just a column name, e.g. an id or updated timestamp which is continually increasing." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "JdbcTableDataObject", + "required": [ + "type", + "table", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type JDBC.\nProvides details for an action to read and write tables in a database through JDBC.\n\nNote that Sparks distributed processing can not directly write to a JDBC table in one transaction.\nJdbcTableDataObject implements this in one transaction by writing to a temporary-table with Spark,\nthen using a separate \\\"insert into ... select\\\" SQL statement to copy data into the final table.\n\nJdbcTableDataObject implements\n-[[CanMergeDataFrame]]by writing a temp table and using one SQL merge statement.\n-[[CanEvolveSchema]] by generating corresponding alter table DDL statements.\n- Overwriting partitions is implemented by using SQL delete and insert statement embedded in one transaction." + }, + "JmsDataObject": { + "type": "object", + "properties": { + "type": { + "const": "JmsDataObject" + }, + "jndiContextFactory": { + "type": "string", + "description": "JNDI Context Factory" + }, + "jndiProviderUrl": { + "type": "string", + "description": "JNDI Provider URL" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode is supported." 
+ }, + "batchSize": { + "type": "integer", + "description": "JMS batch size" + }, + "maxWaitSec": { + "type": "integer" + }, + "maxBatchAgeSec": { + "type": "integer" + }, + "txBatchSize": { + "type": "integer" + }, + "connectionFactory": { + "type": "string", + "description": "JMS Connection Factory" + }, + "queue": { + "type": "string", + "description": "Name of MQ Queue" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "JmsDataObject", + "required": [ + "type", + "jndiContextFactory", + "jndiProviderUrl", + "authMode", + "batchSize", + "maxWaitSec", + "maxBatchAgeSec", + "txBatchSize", + "connectionFactory", + "queue" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type JMS queue.\nProvides details to an Action to access JMS queues." + }, + "JsonFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "JsonFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "jsonOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and\n[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." 
+ }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "stringify": { + "type": "boolean", + "description": "Set the data type for all values to string." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "JsonFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[io.smartdatalake.workflow.dataobject.DataObject]]backed by a JSON data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on JSON formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]] respectively.\n\nNOTE: By default, the JSON option`multiline` is enabled.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "KafkaTopicDataObject": { + "type": "object", + "properties": { + "type": { + "const": "KafkaTopicDataObject" + }, + "topicName": { + "type": "string", + "description": "The name of the topic to read" + }, + "connectionId": { + "type": "string" + }, + "keyType": { + "type": "string", + "description": "Optional type the key column should be converted to. If none is given it will be interpreted as string.", + "enum": [ + "String ", + "Binary ", + "Json ", + "Avro ", + "JsonSchemaRegistry ", + "AvroSchemaRegistry " + ] + }, + "keySchema": { + "type": "string", + "description": "An optional schema for parsing the key column. 
This can be used if keyType = Json or Avro to parse the corresponding content.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, avroSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "valueType": { + "type": "string", + "description": "Optional type the value column should be converted to. If none is given it will be interpreted as string.", + "enum": [ + "String ", + "Binary ", + "Json ", + "Avro ", + "JsonSchemaRegistry ", + "AvroSchemaRegistry " + ] + }, + "valueSchema": { + "type": "string", + "description": "An optional schema for parsing the value column. This has to be specified if valueType = Json or Avro to parse the corresponding content.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, avroSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "allowSchemaEvolution": { + "type": "boolean", + "description": "If set to true schema evolution within schema registry will automatically occur when writing to this DataObject with different key or value schema, otherwise SDL will stop with error.\nThis only applies if keyType or valueType is set to Json/AvroSchemaRegistry.\nKafka Schema Evolution implementation will update schema if existing records with old schema can be read with new schema (backward compatible). Otherwise an IncompatibleSchemaException is thrown." + }, + "selectCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Columns to be selected when reading the DataFrame. Available columns are key, value, topic,\npartition, offset, timestamp, timestampType. If key/valueType is AvroSchemaRegistry the key/value column are\nconvert to a complex type according to the avro schema. To expand it select \\\"value.*\\\".\nDefault is to select key and value." + }, + "datePartitionCol": { + "type": "object", + "properties": { + "colName": { + "type": "string", + "description": "date partition column name to extract time into column on batch read" + }, + "timeFormat": { + "type": "string", + "description": "time format for timestamp in date partition column, definition according to java DateTimeFormatter. Default is \\\"yyyyMMdd\\\"." + }, + "timeUnit": { + "type": "string", + "description": "time unit for timestamp in date partition column, definition according to java ChronoUnit. Default is \\\"days\\\"." + }, + "timeZone": { + "type": "string", + "description": "time zone used for date logic. If not specified, java system default is used." + }, + "includeCurrentPartition": { + "type": "boolean", + "description": "If the current partition should be included. Default is to list only completed partitions.\nAttention: including the current partition might result in data loss if there is more data arriving.\nBut it might be useful to export all data before a scheduled maintenance." + } + }, + "title": "DatePartitionColumnDef", + "required": [ + "colName" + ], + "additionalProperties": false, + "description": "Definition of date partition column to extract formatted time into column." 
+ }, + "batchReadConsecutivePartitionsAsRanges": { + "type": "boolean", + "description": "Set to true if consecutive partitions should be combined as one range of offsets when batch reading from topic. This results in less tasks but can be a performance problem when reading many partitions. (default=false)" + }, + "batchReadMaxOffsetsPerTask": { + "type": "integer", + "description": "Set number of offsets per Spark task when batch reading from topic." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html).\nThese options override connection.options." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "KafkaTopicDataObject", + "required": [ + "type", + "topicName", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]]of type KafkaTopic.\nProvides details to an action to read from Kafka Topics using either\n[[org.apache.spark.sql.DataFrameReader]]or[[org.apache.spark.sql.streaming.DataStreamReader]] \n\nKey & value schema can be automatically read from and written to confluent schema registry for Json and Avro.\nJson and Avro can also be parsed with a fixed schema.\n\nCan interpret record timestamp as SDLB partition values by setting datePartitionCol attribute. This allows to use this DataObject as input for PartitionDiffMode.\nThe DataObject does not support writing with SDLB partition values, as timestamp is autogenerated by Kafka using current time.\n\nSupport incremental output and use with DataObjectStateIncrementalMode." + }, + "PKViolatorsDataObject": { + "type": "object", + "properties": { + "type": { + "const": "PKViolatorsDataObject" + }, + "config": { + "type": "string", + "description": ": The config value can point to a configuration file or a directory containing configuration files." + }, + "flattenOutput": { + "type": "boolean", + "description": ": if true, key and data column are converted from type map to string (default)." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "PKViolatorsDataObject", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Checks for Primary Key violations for all[[DataObject]]s with Primary Keys defined that are registered in the current[[InstanceRegistry]].\nReturns the DataFrame of Primary Key violations.\n\nAlternatively, it can check for Primary Key violations of all[[DataObject]]s defined in config files. For this, the\nconfiguration \\\"config\\\" has to be set to the location of the config.\n\nExample:\n\n```\ndataObjects = {\n...\nprimarykey-violations {\ntype = PKViolatorsDataObject\nconfig = path/to/myconfiguration.conf\n}\n...\n}\n```\n\nSEE: Refer to[[ConfigLoader.loadConfigFromFilesystem()]] for details about the configuration loading." + }, + "ParquetFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "ParquetFileDataObject" + }, + "path": { + "type": "string", + "description": "Hadoop directory where this data object reads/writes it\\'s files.\nIf it doesn\\'t contain scheme and authority, the connections pathPrefix is applied. 
If pathPrefix is not\ndefined or doesn\\'t define scheme and authority, default schema and authority is applied.\nOptionally defined partitions are appended with hadoop standard partition layout to this path.\nOnly files ending with *.parquet* are considered as data for this DataObject." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "partition columns for this data object" + }, + "parquetOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and\n[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "schema": { + "type": "string", + "description": "An optional schema for the spark data frame to be validated on read and write. Note: Existing Parquet files\ncontain a source schema. Therefore, this schema is ignored when reading from existing Parquet files.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "saveMode": { + "type": "string", + "description": "spark[[SaveMode]] to use when writing files, default is \\\"overwrite\\\"", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). 
If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "optional id of[[io.smartdatalake.workflow.connection.HadoopFileConnection]]" + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "ParquetFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[io.smartdatalake.workflow.dataobject.DataObject]]backed by an Apache Hive data source.\n\nIt manages read and write access and configurations required for[[io.smartdatalake.workflow.action.Action]]s to\nwork on Parquet formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]] respectively.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + }, + "RawFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "RawFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "customFormat": { + "type": "string", + "description": "Custom Spark data source format, e.g. binaryFile or text. Only needed if you want to read/write this DataObject with Spark." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for custom Spark data source format. Only of use if you want to read/write this DataObject with Spark." + }, + "fileName": { + "type": "string", + "description": "Definition of fileName. This is concatenated with path and partition layout to search for files. Default is an asterix to match everything." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional[[DataObject]]user-defined schema definition.\n\nSome[[DataObject]]s support optional schema inference.\nSpecifying this attribute disables automatic schema inference. When the wrapped data source contains a source\nschema, this`schema`attribute is ignored.\n\nNote: This is only used by the functionality defined in[[CanCreateDataFrame]], that is,\nwhen reading Spark data frames from the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that bypass Spark data frames ignore the`schema` attribute\nif it is defined." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." 
+ }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "RawFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "DataObject of type raw for files with unknown content.\nProvides details to an Action to access raw files.\nBy specifying format you can custom Spark data formats" + }, + "RelaxedCsvFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "RelaxedCsvFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "csvOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] ." + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "The data object schema.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." 
+ }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "dateColumnType": { + "type": "string", + "description": "Specifies the string format used for writing date typed data.", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "treatMissingColumnsAsCorrupt": { + "type": "boolean", + "description": "If set to true records from files with missing columns in its header are treated as corrupt (default=false).\nCorrupt records are handled according to options.mode (default=permissive)." + }, + "treatSuperfluousColumnsAsCorrupt": { + "type": "boolean", + "description": "If set to true records from files with superfluous columns in its header are treated as corrupt (default=false).\nCorrupt records are handled according to options.mode (default=permissive)." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. 
To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "RelaxedCsvFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]] which allows for more flexible CSV parsing.\nThe standard CsvFileDataObject doesnt support reading multiple CSV-Files with different column order, missing columns\nor additional columns.\nRelaxCsvFileDataObject works more like reading JSON-Files. You need to define a schema, then it tries to read every file\nwith that schema independently of the column order, adding missing columns and removing superfluous ones.\n\nCSV files are read by Spark as whole text files and then parsed manually with Sparks CSV parser class. 
You can therefore use the\nnormal CSV options of spark, but some properties are fixed, e.g. header=true, inferSchema=false, enforceSchema (ignored).\n\nNOTE: This data object sets the following default values for`csvOptions`: delimiter = \\\",\\\", quote = null\nAll other`csvOption` default to the values defined by Apache Spark.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]] \n\nIf mode is permissive you can retrieve the corrupt input record by adding as field to the schema.\nRelaxCsvFileDataObject also supports getting an error msg by adding \\\"_msg\\\" as field to the schema." + }, + "SFtpFileRefDataObject": { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "connectionId": { + "type": "string" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "partitionLayout": { + "type": "string", + "description": "partition layout defines how partition values can be extracted from the path.\nUse \\\"%%\\\" as token to extract the value for a partition column.\nAs partition layout extracts partition from the path of individual files, it can also be used to extract partitions from the file name.\nWith \\\"%%\\\" a regex can be given to limit search. This is especially useful\nif there is no char to delimit the last token from the rest of the path or also between\ntwo tokens.\nBe careful that for directory based partition values extraction, the final path separator must be part\nof the partition layout to extract the last token correctly, e.g. \\\"%year%/\\\" for partitioning with yearly directories." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "SFtpFileRefDataObject", + "required": [ + "type", + "path", + "connectionId" + ], + "additionalProperties": false, + "description": "Connects to SFtp files\nNeeds java library \\\"com.hieronymus % sshj % 0.21.1\\\"\nThe following authentication mechanisms are supported\n-> public/private-key: private key must be saved in ~/.ssh, public key must be registered on server.\n-> user/pwd authentication: user and password is taken from two variables set as parameters.\nThese variables could come from clear text (CLEAR), a file (FILE) or an environment variable (ENV)" + }, + "SnowflakeTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeTableDataObject" + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. 
This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of row-level[[Constraint]] s to enforce when writing to this data object." + }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. 
All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. 
This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "List of[[Expectation]] s to enforce when writing to this data object. Expectations are checks based on aggregates over all rows of a dataset." + }, + "saveMode": { + "type": "string", + "description": "spark[[SDLSaveMode]] to use when writing files, default is \\\"overwrite\\\"", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "connectionId": { + "type": "string", + "description": "The SnowflakeTableConnection to use for the table" + }, + "comment": { + "type": "string", + "description": "An optional comment to add to the table after writing a DataFrame to it" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "SnowflakeTableDataObject", + "required": [ + "type", + "table", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type SnowflakeTableDataObject.\nProvides details to access Snowflake tables via an action\nCan be used both for interacting with Snowflake through Spark with JDBC,\nas well as for actions written in the Snowpark API that run directly on Snowflake" + }, + "SplunkDataObject": { + "type": "object", + "properties": { + "type": { + "const": "SplunkDataObject" + }, + "params": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "queryFrom": { + "type": "string", + "existingJavaType": "java.time.LocalDateTime" + }, + "queryTo": { + "type": "string", + "existingJavaType": "java.time.LocalDateTime" + }, + "queryTimeInterval": { + "type": "string", + "existingJavaType": "java.time.Duration" + }, + "columnNames": { + "type": "array", + "items": { + "type": "string" + } + }, + "parallelRequests": { + "type": "integer" + } + }, + "title": "SplunkParams", + "required": [ + "query", + "queryFrom", + "queryTo" + ], + "additionalProperties": false + }, + "connectionId": { + "type": "string" + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "SplunkDataObject", + "required": [ + "type", + "params", + "connectionId" + ], + "additionalProperties": false, + "description": "[[DataObject]] of type Splunk.\nProvides details to an action to access Splunk logs." 
+ }, + "TickTockHiveTableDataObject": { + "type": "object", + "properties": { + "type": { + "const": "TickTockHiveTableDataObject" + }, + "path": { + "type": "string" + }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "analyzeTableAfterWrite": { + "type": "boolean" + }, + "dateColumnType": { + "type": "string", + "enum": [ + "Default ", + "String ", + "Date " + ] + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "table": { + "$ref": "#/definitions/Others/Table" + }, + "constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "name of the constraint" + }, + "description": { + "type": "string", + "description": "optional detailed description of the constraint" + }, + "expression": { + "type": "string", + "description": "SQL expression to evaluate on every row. The expressions return value should be a boolean.\nIf it evaluates to true the constraint is validated successfully, otherwise it will throw an exception." + }, + "errorMsgCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of column names to add to error message.\nNote that primary key columns are always included.\nIf there is no primary key defined, by default all columns with simple datatype are included in the error message." + } + }, + "title": "Constraint", + "required": [ + "name", + "expression" + ], + "additionalProperties": false, + "description": "Definition of row-level constraint to validate." + }, + "description": "List of constraint definitions to validate on write, see[[Constraint]] for details.\nConstraints are expressions defined on row-level and validated during evaluation of the DataFrame.\nIf validation fails an exception is thrown and further processing is stopped.\nNote that this is done while evaluating the DataFrame when writing to the DataObject. It doesn\\'t need a separate action on the DataFrame.\nIf a constraint validation for a row fails, it will throw an exception and abort writing to the DataObject." 
+ }, + "expectations": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AvgCountPerPartitionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "AvgCountPerPartitionExpectation", + "required": [ + "name", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the average number of records per partitions.\n\nNote that the scope for evaluating this expectation is fixed to Job." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLQueryExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "code": { + "type": "string", + "description": "a SQL query returning a single row. All column will be added as metrics.\nIf there are more than one column, there has to be one column with the same name as this expectation. This column will be used to compare against a potential condition of the expectation.\nThe special token %{inputViewName} must be used to insert the temporary view name used to provide the DataFrame to the query." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLQueryExpectation", + "required": [ + "name", + "code", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on a SQL query to be evaluate on dataset-level.\nThe SQL query will be evaluated in a separate Spark job against the DataFrame." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "aggExpression": { + "type": "string", + "description": "SQL aggregate expression to evaluate on dataset, e.g. count(*)." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'= 0\\\".\nTogether with the result of the aggExpression evaluation on the left side, it forms the condition to validate the expectation.\nIf no expectation is defined, the aggExpression evaluation result is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLExpectation", + "required": [ + "name", + "aggExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of expectation based on a SQL aggregate expression to evaluate on dataset-level." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ExpectationDefaultImpl" + } + }, + "title": "ExpectationDefaultImpl", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Default implementation for getValidationErrorColumn for metric of type`any` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "CountExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected value for validation, e.g. \\'> 100000\\\".\nIf no expectation is defined, the result value is is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "CountExpectation", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on the number of records." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLFractionExpectation" + }, + "name": { + "type": "string", + "description": "The name of the expectation" + }, + "description": { + "type": "string", + "description": "Optional detailed description of the expectation" + }, + "countConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean to match the rows to count for the fraction." + }, + "globalConditionExpression": { + "type": "string", + "description": "SQL expression returning a boolean used as global filter, e.g. fraction row count and total row count are filtered with global filter before counting." + }, + "expectation": { + "type": "string", + "description": "Optional SQL comparison operator and literal to define expected percentage for validation, e.g. \\'= 0.9\\\".\nIf no expectation is defined, the result value is just recorded in metrics." + }, + "scope": { + "type": "string", + "description": "The aggregation scope used to evaluate the aggregate expression.\nDefault is \\'Job\\', which evaluates the records transformed by the current job. This is implemented without big performance impacts on Spark.\nOther options are \\'All\\' and \\'JobPartitions\\', which are implemented by reading the output data again.", + "enum": [ + "All ", + "JobPartition ", + "Job " + ] + }, + "failedSeverity": { + "type": "string", + "description": "Severity if expectation fails - can be Error (default) or Warn.\nIf set to Error, execution will fail, otherwise there will be just a warning logged.", + "enum": [ + "Warn ", + "Error " + ] + } + }, + "title": "SQLFractionExpectation", + "required": [ + "name", + "countConditionExpression", + "type" + ], + "additionalProperties": false, + "description": "Definition of an expectation based on counting how many rows match an expression vs the number of all rows.\nThe fraction of these two counts is compared against a given expectation." + } + ] + }, + "description": "Map of expectation name and definition to evaluate on write, see[[Expectation]] for details.\nExpectations are aggregation expressions defined on dataset-level and evaluated on every write.\nBy default their result is logged with level info (ok) and error (failed), but this can be customized to be logged as warning.\nIn case of failed expectations logged as error, an exceptions is thrown and further processing is stopped.\nNote that the exception is thrown after writing to the DataObject is finished.\n\nThe following expectations names are reserved to create default metrics and should not be used:\n- count" + }, + "numInitialHdfsPartitions": { + "type": "integer" + }, + "saveMode": { + "type": "string", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Definition of partitions that are expected to exists.\nThis is used to validate that partitions being read exists and don\\'t return no data.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nexample: \\\"elements[\\'yourColName\\'] > 2017\\\"\n\nOTHERTAG: true if partition is expected to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Configure a housekeeping mode to e.g cleanup, archive and compact partitions.\nDefault is None." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "TickTockHiveTableDataObject", + "required": [ + "type", + "table" + ], + "additionalProperties": false + }, + "WebserviceFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "WebserviceFileDataObject" + }, + "url": { + "type": "string" + }, + "additionalHeaders": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "timeouts": { + "type": "object", + "properties": { + "connectionTimeoutMs": { + "type": "integer" + }, + "readTimeoutMs": { + "type": "integer" + } + }, + "title": "HttpTimeoutConfig", + "required": [ + "connectionTimeoutMs", + "readTimeoutMs" + ], + "additionalProperties": false + }, + "readTimeoutMs": { + "type": "integer" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "mimeType": { + "type": "string" + }, + "writeMethod": { + "type": "string", + "enum": [ + "Delete ", + "Put ", + "Post ", + "Get " + ] + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string" + }, + "port": { + "type": "integer" + } + }, + "title": "HttpProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false + }, + "followRedirects": { + "type": "boolean" + }, + "partitionDefs": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "values": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "title": "WebservicePartitionDefinition", + "required": [ + "name", + "values" + ], + "additionalProperties": false + }, + "description": "list of partitions with list of possible values for every entry" + }, + "partitionLayout": { + "type": "string", + "description": "definition of partitions in query string. Use %% as placeholder for partition column value in layout." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "WebserviceFileDataObject", + "required": [ + "type", + "url" + ], + "additionalProperties": false, + "description": "[[DataObject]] to call webservice and return response as InputStream\nThis is implemented as FileRefDataObject because the response is treated as some file content.\nFileRefDataObjects support partitioned data. For a WebserviceFileDataObject partitions are mapped as query parameters to create query string.\nAll possible query parameter values must be given in configuration." + }, + "XmlFileDataObject": { + "type": "object", + "properties": { + "type": { + "const": "XmlFileDataObject" + }, + "path": { + "type": "string", + "description": "The root path of the files that are handled by this DataObject.\nFor most DataObjects this can be a directory or a specific file." + }, + "rowTag": { + "type": "string" + }, + "xmlOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Settings for the underlying[[org.apache.spark.sql.DataFrameReader]]and[[org.apache.spark.sql.DataFrameWriter]] ." 
+ }, + "partitions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Definition of partition columns" + }, + "schema": { + "type": "string", + "description": "An optional data object schema. If defined, any automatic schema inference is avoided.\nAs this corresponds to the schema on write, it must not include the optional filenameColumn on read.\nDefine the schema by using one of the schema providers DDL, jsonSchemaFile, xsdFile or caseClassName.\nThe schema provider and its configuration value must be provided in the format #.\nA DDL-formatted string is a comma separated list of field definitions, e.g., a INT, b STRING." + }, + "schemaMin": { + "type": "string", + "description": "An optional, minimal schema that a[[DataObject]]schema must have to pass schema validation.\n\nThe schema validation semantics are:\n- Schema A is valid in respect to a minimal schema B when B is a subset of A. This means: the whole column set of B is contained in the column set of A.\n- A column of B is contained in A when A contains a column with equal name and data type.\n- Column order is ignored.\n- Column nullability is ignored.\n- Duplicate columns in terms of name and data type are eliminated (set semantics).\n\nNote: This is mainly used by the functionality defined in[[CanCreateDataFrame]]and[[CanWriteDataFrame]], that is,\nwhen reading or writing Spark data frames from/to the underlying data container.\n[[io.smartdatalake.workflow.action.Action]]s that work with files ignore the`schemaMin` attribute\nif it is defined.\nAdditionally schemaMin can be used to define the schema used if there is no data or table doesn\\'t yet exist." + }, + "saveMode": { + "type": "string", + "description": "Overwrite or Append new data.\nWhen writing partitioned data, this applies only to partitions concerned.", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + }, + "sparkRepartition": { + "type": "object", + "properties": { + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition before writing to DataObject by repartitioning the DataFrame.\nThis controls how many files are created in each Hadoop partition." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a Hadoop partition.\nIf DataObject has Hadoop partitions defined, keyCols must be defined." + }, + "sortCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional columns to sort records inside files created." + }, + "filename": { + "type": "string", + "description": "Option filename to rename target file(s). If numberOfTasksPerPartition is greater than 1,\nmultiple files can exist in a directory and a number is inserted into the filename after the first \\'.\\'.\nExample: filename=data.csv -> files created are data.1.csv, data.2.csv, ..." + } + }, + "title": "SparkRepartitionDef", + "required": [ + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "This controls repartitioning of the DataFrame before writing with Spark to Hadoop.\n\nWhen writing multiple partitions of a partitioned DataObject, the number of spark tasks created is equal to numberOfTasksPerPartition\nmultiplied with the number of partitions to write. 
To spread the records of a partition only over numberOfTasksPerPartition spark tasks,\nkeyCols must be given which are used to derive a task number inside the partition (hashvalue(keyCols) modulo numberOfTasksPerPartition).\n\nWhen writing to an unpartitioned DataObject or only one partition of a partitioned DataObject, the number of spark tasks created is equal\nto numberOfTasksPerPartition. Optional keyCols can be used to keep corresponding records together in the same task/file." + }, + "flatten": { + "type": "boolean" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "connectionId": { + "type": "string", + "description": "Return the connection id.\n\nConnection defines path prefix (scheme, authority, base path) and ACL\\'s in central location." + }, + "filenameColumn": { + "type": "string", + "description": "The name of the (optional) additional column containing the source filename" + }, + "expectedPartitionsCondition": { + "type": "string", + "description": "Optional definition of partitions expected to exist.\nDefine a Spark SQL expression that is evaluated against a[[PartitionValues]] instance and returns true or false\nDefault is to expect all partitions to exist." + }, + "housekeepingMode": { + "oneOf": [ + { + "$ref": "#/definitions/HousekeepingMode/PartitionArchiveCompactionMode" + }, + { + "$ref": "#/definitions/HousekeepingMode/PartitionRetentionMode" + } + ], + "description": "Optional definition of a housekeeping mode applied after every write. E.g. it can be used to cleanup, archive and compact partitions.\nSee HousekeepingMode for available implementations. Default is None." + }, + "metadata": { + "$ref": "#/definitions/Others/DataObjectMetadata" + } + }, + "title": "XmlFileDataObject", + "required": [ + "type", + "path" + ], + "additionalProperties": false, + "description": "A[[DataObject]]backed by an XML data source.\n\nIt manages read and write access and configurations required for[[Action]]s to\nwork on XML formatted files.\n\nReading and writing details are delegated to Apache Spark[[org.apache.spark.sql.DataFrameReader]]\nand[[org.apache.spark.sql.DataFrameWriter]]respectively. 
The reader and writer implementations are provided by\nthe[[https://github.com/databricks/spark-xml databricks spark-xml]] project.\nNote that writing XML-file partitioned is not supported by spark-xml.\n\nSEE: [[org.apache.spark.sql.DataFrameReader]]\n\nSEE: [[org.apache.spark.sql.DataFrameWriter]]" + } + }, + "SaveModeOptions": { + "SaveModeGenericOptions": { + "type": "object", + "properties": { + "type": { + "const": "SaveModeGenericOptions" + }, + "saveMode": { + "type": "string", + "enum": [ + "Merge ", + "OverwriteOptimized ", + "OverwritePreserveDirectories ", + "Ignore ", + "ErrorIfExists ", + "Append ", + "Overwrite " + ] + } + }, + "title": "SaveModeGenericOptions", + "required": [ + "type", + "saveMode" + ], + "additionalProperties": false, + "description": "This class can be used to override save mode without further special parameters." + }, + "SaveModeMergeOptions": { + "type": "object", + "properties": { + "type": { + "const": "SaveModeMergeOptions" + }, + "deleteCondition": { + "type": "string", + "description": "A condition to control if matched records are deleted. If no condition is given, *no* records are delete." + }, + "updateCondition": { + "type": "string", + "description": "A condition to control if matched records are updated. If no condition is given all matched records are updated (default).\nNote that delete is applied before update. Records selected for deletion are automatically excluded from the updates." + }, + "updateColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of column names to update in update clause. If empty all columns (except primary keys) are updated (default)" + }, + "insertCondition": { + "type": "string", + "description": "A condition to control if unmatched records are inserted. If no condition is given all unmatched records are inserted (default)." + }, + "insertColumnsToIgnore": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of column names to ignore in insert clause. If empty all columns are inserted (default)." + }, + "insertValuesOverride": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional Map of column name and value expression to override value on insert. Value expressions have to be a sql expression string, e.g. true or \\'abc\\'." + }, + "additionalMergePredicate": { + "type": "string", + "description": "To optimize performance for SDLSaveMode.Merge it might be interesting to limit the records read from the existing table data, e.g. merge operation might use only the last 7 days." + } + }, + "title": "SaveModeMergeOptions", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Options to control detailed behaviour of SaveMode.Merge.\nIn Spark expressions use table alias \\'existing\\' to reference columns of the existing table data, and table alias \\'new\\' to reference columns of new data set." 
+ } + }, + "ParsableScriptDef": { + "CmdScript": { + "type": "object", + "properties": { + "type": { + "const": "CmdScript" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "winCmd": { + "type": "string", + "description": "Cmd to execute on windows operating systems - note that it is executed with \\\"cmd /C\\\" prefixed" + }, + "linuxCmd": { + "type": "string", + "description": "Cmd to execute on linux operating systems - note that it is executed with \\\"sh -c\\\" prefixed." + } + }, + "title": "CmdScript", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Execute a command on the command line and get its std output\nCommand can be different for windows and linux operating systems, but it must be defined for at least one of them.\n\nIf return value is not zero an exception is thrown.\n\nNote about internal implementation: on execution value of parameter map entries where key starts with\n- \\'param\\' will be added as parameter after the docker run command, sorted by key.\nThis allows to customize execution behaviour through Actions or DataObjects using CmdScript." + }, + "DockerRunScript": { + "type": "object", + "properties": { + "type": { + "const": "DockerRunScript" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "image": { + "type": "string", + "description": "Docker image to run" + }, + "winDockerCmd": { + "type": "string", + "description": "Cmd to execute docker on windows operating systems. Default is \\'docker\\'." + }, + "linuxDockerCmd": { + "type": "string", + "description": "Cmd to execute docker on linux operating systems. Default is \\'docker\\'." + }, + "localDataDirToMount": { + "type": "string", + "description": "Optional directory that will be mounted as /mnt/data in the container. This is needed if your container wants to access files available in your local filesystem." + } + }, + "title": "DockerRunScript", + "required": [ + "type", + "image" + ], + "additionalProperties": false, + "description": "Run a docker image and get its std output.\n\nIf return value is not zero an exception is thrown.\n\nNote about internal implementation: on execution value of parameter map entries where key starts with\n- \\'runParam\\' will be added as parameter after the docker run command, sorted by their key.\n- \\'dockerParam\\' will be added as parameter for the docker command, e.g. before the image name in the docker run command, sorted by their key.\nThis allows to customize execution behaviour through Actions or DataObjects using CmdScript." 
+ } + }, + "Others": { + "ActionMetadata": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Readable name of the Action" + }, + "description": { + "type": "string", + "description": "Description of the content of the Action" + }, + "feed": { + "type": "string", + "description": "Name of the feed this Action belongs to" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional custom tags for this object" + } + }, + "title": "ActionMetadata", + "additionalProperties": false, + "description": "Additional metadata for an Action" + }, + "ConnectionMetadata": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Readable name of the Connection" + }, + "description": { + "type": "string", + "description": "Description of the content of the Connection" + }, + "layer": { + "type": "string", + "description": "Name of the layer this Connection belongs to" + }, + "subjectArea": { + "type": "string", + "description": "Name of the subject area this Connection belongs to" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional custom tags for this object" + } + }, + "title": "ConnectionMetadata", + "additionalProperties": false, + "description": "Additional metadata for a Connection" + }, + "DataObjectMetadata": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Readable name of the DataObject" + }, + "description": { + "type": "string", + "description": "Description of the content of the DataObject" + }, + "layer": { + "type": "string", + "description": "Name of the layer this DataObject belongs to" + }, + "subjectArea": { + "type": "string", + "description": "Name of the subject area this DataObject belongs to" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional custom tags for this object" + } + }, + "title": "DataObjectMetadata", + "additionalProperties": false, + "description": "Additional metadata for a DataObject" + }, + "Table": { + "type": "object", + "properties": { + "db": { + "type": "string", + "description": "database-schema to be used for this table.\n If there exists a connection for the DataObject and this field is not defined, it will be set to the connections database value .\nCalled db for backwards-compatibility because for hive tables, db and schema mean the same thing." + }, + "name": { + "type": "string", + "description": "table name" + }, + "query": { + "type": "string", + "description": "optional select query" + }, + "primaryKey": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional sequence of primary key columns" + }, + "foreignKeys": { + "type": "array", + "items": { + "type": "object", + "properties": { + "db": { + "type": "string", + "description": "target database, if not defined it is assumed to be the same as the table owning the foreign key" + }, + "table": { + "type": "string", + "description": "referenced target table name" + }, + "columns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "mapping of source column(s) to referenced target table column(s). 
The map is given\nas a list of objects with the following syntax: {\\\"local_column_name\\\" : \\\"external_column_name\\\"}" + }, + "name": { + "type": "string", + "description": "optional name for foreign key, e.g to depict it\\'s role.\n\n\nForeign keys in .conf files are to be defined like the following example \n(here two foreign key objects): \nforeignKeys = [\n{\ndb = \\\"OPTIONAL_DB_name\\\"\ntable = \\\"table_id\\\"\ncolumns = {\n\\\"local_column_name\\\": \\\"external_column_name\\\"\n}\nname = \\\"OPTIONAL_key_name\\\"\n},\n{\ntable = \\\"another_table_id\\\"\ncolumns = {\n\\\"another_local_column_name\\\": \\\"another_external_column_name\\\"\n}\nname = \\\"another_OPTIONAL_key_name\\\"\n}\n]" + } + }, + "title": "ForeignKey", + "required": [ + "table", + "columns" + ], + "additionalProperties": false, + "description": "Foreign key definition." + }, + "description": "optional sequence of foreign key definitions.\nThis is used as metadata for a data catalog.\nEach foreign key in the .conf files is an object with the following properties: \n{db: string, table: string , name: string map: Map[String]}, whereas a Map[String] is simply \na further object of the type {:string, :string}. For example: \nforeignKeys = [\n{\ndb = \\\"OPTIONAL_DB_name\\\" \ntable = \\\"table_id\\\" \ncolumns = { \n\\\"local_column_name\\\": \\\"external_column_name\\\" \n} \nname = \\\"OPTIONAL_key_name\\\" \n} \n]" + }, + "catalog": { + "type": "string", + "description": "Optional catalog to be used for this table. If null default catalog is used.\nIf there exists a connection with catalog value for the DataObject and this field is not defined, it will be set to the connections catalog value." + } + }, + "title": "Table", + "required": [ + "name" + ], + "additionalProperties": false, + "description": "Table attributes" + } + }, + "GenericDfTransformer": { + "AdditionalColumnsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "AdditionalColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "additionalColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]]and added to the DataFrame as literal columns.\n[[DefaultExpressionData]] contains informations from the context of the SDLB job, like runId or feed name." + }, + "additionalDerivedColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against the input DataFrame and added to the DataFrame as derived columns." + } + }, + "title": "AdditionalColumnsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Add additional columns to the DataFrame by extracting information from the context or derived from input columns." 
+ }, + "BlacklistTransformer": { + "type": "object", + "properties": { + "type": { + "const": "BlacklistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnBlacklist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to exclude from DataFrame" + } + }, + "title": "BlacklistTransformer", + "required": [ + "type", + "columnBlacklist" + ], + "additionalProperties": false, + "description": "Apply a column blacklist to a DataFrame." + }, + "DataValidationTransformer": { + "type": "object", + "properties": { + "type": { + "const": "DataValidationTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "rules": { + "type": "array", + "items": { + "$ref": "#/definitions/ValidationRule/RowLevelValidationRule" + }, + "description": "list of validation rules to apply to the DataFrame" + }, + "errorsColumn": { + "type": "string", + "description": "Optional column name for the list of error messages. Default is \\\"errors\\\"." + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "For validating the rule expression, the runtime subFeedType is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "DataValidationTransformer", + "required": [ + "type", + "rules" + ], + "additionalProperties": false, + "description": "Apply validation rules to a DataFrame and collect potential violation error messages in a new column." + }, + "DecryptColumnsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "DecryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "decryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "algorithm": { + "type": "string" + } + }, + "title": "DecryptColumnsTransformer", + "required": [ + "type", + "decryptColumns" + ], + "additionalProperties": false, + "description": "Decryption of specified columns using AES/GCM algorithm." 
+ }, + "EncryptColumnsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "EncryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "encryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "EncryptColumnsTransformer", + "required": [ + "type", + "encryptColumns" + ], + "additionalProperties": false, + "description": "Encryption of specified columns using AES/GCM algorithm." + }, + "FilterTransformer": { + "type": "object", + "properties": { + "type": { + "const": "FilterTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "filterClause": { + "type": "string", + "description": "Spark SQL expression to filter the DataFrame" + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "When parsing the configuration the runtime subFeedType for validating the filter expression is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "FilterTransformer", + "required": [ + "type", + "filterClause" + ], + "additionalProperties": false, + "description": "Apply a filter condition to a DataFrame." + }, + "PythonCodeDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "PythonCodeDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Optional python code to user for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "file": { + "type": "string", + "description": "Optional file with python code to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "PythonCodeDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Python/PySpark code.\nNote that this transformer needs a Python and PySpark environment installed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + "SQLDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "SQLDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"\nThe special token %{inputViewName} or ${inputViewName_} can be used to insert the temporary view name.\nThe input name is either the name of the DataObject, or the name of the previous transformation\nif this is not the first transformation of the chain. Make sure to change the standard name of\nthe previous transformation in that case." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "SQLDfTransformer", + "required": [ + "type", + "code" + ], + "additionalProperties": false, + "description": "Configuration of a custom GenericDataFrame transformation between one input and one output (1:1) as SQL code.\nThe input data is available as temporary view in SQL. The inputs name is either the name of the DataObject,\nor the name of the previous transformation, if this is not the first transformation of the chain. Also note that to create\nthe name of temporary view, special characters are replaced by underscores and a postfix \\\"_sdltemp\\\" is added.\nIt is therefore recommended to use special token %{inputViewName} or ${inputViewName_} that will be\nreplaced with the name of the temporary view at runtime." 
+ }, + "ScalaClassGenericDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassGenericDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomGenericDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassGenericDfTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomGenericDfTransformer]] ." + }, + "ScalaClassSnowparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSnowparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomSnowparkDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSnowparkDfTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Snowpark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomSnowparkDfTransformer]] ." 
+ }, + "ScalaClassSparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDfTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomDfTransformer]] ." + }, + "ScalaClassSparkDsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "transformerClassName": { + "type": "string", + "description": "class name implementing trait[[CustomDsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDsTransformer", + "required": [ + "type", + "transformerClassName" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-Dataset transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a SparkSession, a Dataset and a map of options and has to return a Dataset.\nThe Java/Scala class has to implement interface[[CustomDsTransformer]] ." + }, + "ScalaCodeSparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaCodeSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "file": { + "type": "string", + "description": "File where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." 
+ }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaCodeSparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The scala code has to implement a function of type[[fnTransformType]] ." + }, + "ScalaNotebookSparkDfTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaNotebookSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "url": { + "type": "string", + "description": "Url to download notebook in IPYNB-format, which defines transformation." + }, + "functionName": { + "type": "string", + "description": "The notebook needs to contain a Scala-function with this name and type[[fnTransformType]] ." + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information for webservice, e.g. BasicAuthMode for user/pw authentication" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaNotebookSparkDfTransformer", + "required": [ + "type", + "url", + "functionName" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nThe code is loaded from a Notebook. It should define a transform function with a configurable name, which receives a DataObjectId, a DataFrame\nand a map of options and has to return a DataFrame, see also ([[fnTransformType]] ).\nNotebook-cells starting with \\\"//!IGNORE\\\" will be ignored." 
+ }, + "SparkRepartitionTransformer": { + "type": "object", + "properties": { + "type": { + "const": "SparkRepartitionTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition value by repartitioning the DataFrame." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a partition value." + } + }, + "title": "SparkRepartitionTransformer", + "required": [ + "type", + "numberOfTasksPerPartition" + ], + "additionalProperties": false, + "description": "Repartition DataFrame\nFor detailled description about repartitioning DataFrames see also[[SparkRepartitionDef]]" + }, + "StandardizeColNamesTransformer": { + "type": "object", + "properties": { + "type": { + "const": "StandardizeColNamesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "camelCaseToLower": { + "type": "boolean", + "description": "If selected, converts Camel case names to lower case with underscores, i.e. TestString -> test_string, testABCtest -> test_ABCtest\nOtherwise converts just to lower case." + }, + "normalizeToAscii": { + "type": "boolean", + "description": "If selected, converts UTF-8 special characters (e.g. diacritics, umlauts) to ASCII chars (best effort), i.e. Öffi_émily -> Oeffi_emily" + }, + "removeNonStandardSQLNameChars": { + "type": "boolean", + "description": "Remove all chars from a string which dont belong to lowercase SQL standard naming characters, i.e abc$!-& -> abc" + } + }, + "title": "StandardizeColNamesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardizes column names to be used without quoting by using camelCase to lower_case_with_underscore rule (default), and further cleanup rules for special characters (default).\nParameters below can be used to disable specific rules if needed." + }, + "StandardizeSparkDatatypesTransformer": { + "type": "object", + "properties": { + "type": { + "const": "StandardizeSparkDatatypesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + } + }, + "title": "StandardizeSparkDatatypesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardize datatypes of a Spark-DataFrame.\nCurrent implementation converts all decimal datatypes to a corresponding integral or float datatype" + }, + "WhitelistTransformer": { + "type": "object", + "properties": { + "type": { + "const": "WhitelistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnWhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to keep from DataFrame" + } + }, + "title": "WhitelistTransformer", + "required": [ + "type", + "columnWhitelist" + ], + "additionalProperties": false, + "description": "Apply a column whitelist to a DataFrame." 
+ } + }, + "AuthMode": { + "AuthHeaderMode": { + "type": "object", + "properties": { + "type": { + "const": "AuthHeaderMode" + }, + "headerName": { + "type": "string" + }, + "secret": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "secretVariable": { + "type": "string", + "deprecated": true + } + }, + "title": "AuthHeaderMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Connect by custom authorization header" + }, + "BasicAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "BasicAuthMode" + }, + "user": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "password": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "userVariable": { + "type": "string", + "deprecated": true + }, + "passwordVariable": { + "type": "string", + "deprecated": true + } + }, + "title": "BasicAuthMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Connect by basic authentication" + }, + "CustomHttpAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "CustomHttpAuthMode" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomHttpAuthModeLogic]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "Options to pass to the custom auth mode logic in prepare function.\n\nThe value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "CustomHttpAuthMode", + "required": [ + "type", + "className", + "options" + ], + "additionalProperties": false, + "description": "Connect with custom HTTP authentication" + }, + "KeycloakClientSecretAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "KeycloakClientSecretAuthMode" + }, + "ssoServer": { + "type": "string" + }, + "ssoRealm": { + "type": "string" + }, + "ssoGrantType": { + "type": "string" + }, + "clientIdVariable": { + "type": "string", + "deprecated": true + }, + "clientId": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "clientSecretVariable": { + "type": "string", + "deprecated": true + }, + "clientSecret": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "KeycloakClientSecretAuthMode", + "required": [ + "type", + "ssoServer", + "ssoRealm", + "ssoGrantType" + ], + "additionalProperties": false, + "description": "Connect by using Keycloak to manage token and token refresh giving clientId/secret as information.\nFor HTTP Connection this is used as Bearer token in Authorization header." + }, + "PublicKeyAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "PublicKeyAuthMode" + }, + "userVariable": { + "type": "string", + "deprecated": true + }, + "user": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "PublicKeyAuthMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Validate by user and private/public key\nPrivate key is read from .ssh" + }, + "SASLSCRAMAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "SASLSCRAMAuthMode" + }, + "username": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "passwordVariable": { + "type": "string", + "deprecated": true + }, + "password": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "sslMechanism": { + "type": "string" + }, + "truststorePath": { + "type": "string" + }, + "truststoreType": { + "type": "string" + }, + "truststorePassVariable": { + "type": "string", + "deprecated": true + }, + "truststorePass": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "SASLSCRAMAuthMode", + "required": [ + "type", + "username", + "sslMechanism", + "truststorePath" + ], + "additionalProperties": false, + "description": "Validate by SASL_SSL Authentication : user / password and truststore" + }, + "SSLCertsAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "SSLCertsAuthMode" + }, + "keystorePath": { + "type": "string" + }, + "keystoreType": { + "type": "string" + }, + "keystorePassVariable": { + "type": "string", + "deprecated": true + }, + "keystorePass": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "truststorePath": { + "type": "string" + }, + "truststoreType": { + "type": "string" + }, + "truststorePassVariable": { + "type": "string", + "deprecated": true + }, + "truststorePass": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "SSLCertsAuthMode", + "required": [ + "type", + "keystorePath", + "truststorePath" + ], + "additionalProperties": false, + "description": "Validate by SSL Certificates : Only location an credentials. Additional attributes should be\nsupplied via options map" + }, + "TokenAuthMode": { + "type": "object", + "properties": { + "type": { + "const": "TokenAuthMode" + }, + "tokenVariable": { + "type": "string", + "deprecated": true + }, + "token": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "TokenAuthMode", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Connect by token\nFor HTTP Connection this is used as Bearer token in Authorization header." + } + }, + "GenericDfsTransformer": { + "DfTransformerWrapperDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "DfTransformerWrapperDfsTransformer" + }, + "transformer": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "BlacklistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnBlacklist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to exclude from DataFrame" + } + }, + "title": "BlacklistTransformer", + "required": [ + "columnBlacklist", + "type" + ], + "additionalProperties": false, + "description": "Apply a column blacklist to a DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "StandardizeColNamesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "camelCaseToLower": { + "type": "boolean", + "description": "If selected, converts Camel case names to lower case with underscores, i.e. TestString -> test_string, testABCtest -> test_ABCtest\nOtherwise converts just to lower case." + }, + "normalizeToAscii": { + "type": "boolean", + "description": "If selected, converts UTF-8 special characters (e.g. diacritics, umlauts) to ASCII chars (best effort), i.e. Öffi_émily -> Oeffi_emily" + }, + "removeNonStandardSQLNameChars": { + "type": "boolean", + "description": "Remove all chars from a string which dont belong to lowercase SQL standard naming characters, i.e abc$!-& -> abc" + } + }, + "title": "StandardizeColNamesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardizes column names to be used without quoting by using camelCase to lower_case_with_underscore rule (default), and further cleanup rules for special characters (default).\nParameters below can be used to disable specific rules if needed." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "DataValidationTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "rules": { + "type": "array", + "items": { + "$ref": "#/definitions/ValidationRule/RowLevelValidationRule" + }, + "description": "list of validation rules to apply to the DataFrame" + }, + "errorsColumn": { + "type": "string", + "description": "Optional column name for the list of error messages. Default is \\\"errors\\\"." + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "For validating the rule expression, the runtime subFeedType is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "DataValidationTransformer", + "required": [ + "rules", + "type" + ], + "additionalProperties": false, + "description": "Apply validation rules to a DataFrame and collect potential violation error messages in a new column." + }, + { + "type": "object", + "properties": { + "type": { + "const": "DecryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "decryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "algorithm": { + "type": "string" + } + }, + "title": "DecryptColumnsTransformer", + "required": [ + "decryptColumns", + "type" + ], + "additionalProperties": false, + "description": "Decryption of specified columns using AES/GCM algorithm." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SparkDfTransformer" + } + }, + "title": "SparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Interface to implement Spark-DataFrame transformers working with one input and one output (1:1)" + }, + { + "type": "object", + "properties": { + "type": { + "const": "OptionsSparkDfTransformer" + } + }, + "title": "OptionsSparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Interface to implement Spark-DataFrame transformers working with one input and one output (1:1) and options.\nThis trait extends OptionsGenericDfTransformer and passes a map of options as parameter to the transform function.\nThis is mainly used by custom transformers." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSnowparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomSnowparkDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSnowparkDfTransformer", + "required": [ + "className", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Snowpark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomSnowparkDfTransformer]] ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaNotebookSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "url": { + "type": "string", + "description": "Url to download notebook in IPYNB-format, which defines transformation." + }, + "functionName": { + "type": "string", + "description": "The notebook needs to contain a Scala-function with this name and type[[fnTransformType]] ." + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information for webservice, e.g. BasicAuthMode for user/pw authentication" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaNotebookSparkDfTransformer", + "required": [ + "url", + "functionName", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nThe code is loaded from a Notebook. 
It should define a transform function with a configurable name, which receives a DataObjectId, a DataFrame\nand a map of options and has to return a DataFrame, see also ([[fnTransformType]] ).\nNotebook-cells starting with \\\"//!IGNORE\\\" will be ignored." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDfTransformer", + "required": [ + "className", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomDfTransformer]] ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassGenericDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomGenericDfTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassGenericDfTransformer", + "required": [ + "className", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The Java/Scala class has to implement interface[[CustomGenericDfTransformer]] ." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SQLDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"\nThe special token %{inputViewName} or ${inputViewName_} can be used to insert the temporary view name.\nThe input name is either the name of the DataObject, or the name of the previous transformation\nif this is not the first transformation of the chain. Make sure to change the standard name of\nthe previous transformation in that case." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "SQLDfTransformer", + "required": [ + "code", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom GenericDataFrame transformation between one input and one output (1:1) as SQL code.\nThe input data is available as temporary view in SQL. The inputs name is either the name of the DataObject,\nor the name of the previous transformation, if this is not the first transformation of the chain. Also note that to create\nthe name of temporary view, special characters are replaced by underscores and a postfix \\\"_sdltemp\\\" is added.\nIt is therefore recommended to use special token %{inputViewName} or ${inputViewName_} that will be\nreplaced with the name of the temporary view at runtime." + }, + { + "type": "object", + "properties": { + "type": { + "const": "StandardizeSparkDatatypesTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + } + }, + "title": "StandardizeSparkDatatypesTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Standardize datatypes of a Spark-DataFrame.\nCurrent implementation converts all decimal datatypes to a corresponding integral or float datatype" + }, + { + "type": "object", + "properties": { + "type": { + "const": "EncryptColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "encryptColumns": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns [columnA, columnB] to be encrypted" + }, + "keyVariable": { + "type": "string", + "description": "contains the id of the provider and the name of the secret with format #,\ne.g. ENV# to get a secret from an environment variable OR CLEAR#mYsEcReTkeY", + "deprecated": true + }, + "key": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "EncryptColumnsTransformer", + "required": [ + "encryptColumns", + "type" + ], + "additionalProperties": false, + "description": "Encryption of specified columns using AES/GCM algorithm." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "transformerClassName": { + "type": "string", + "description": "class name implementing trait[[CustomDsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDsTransformer", + "required": [ + "transformerClassName", + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-Dataset transformation between one input and one output (1:1) as Java/Scala Class.\nDefine a transform function which receives a SparkSession, a Dataset and a map of options and has to return a Dataset.\nThe Java/Scala class has to implement interface[[CustomDsTransformer]] ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "ScalaCodeSparkDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "file": { + "type": "string", + "description": "File where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaCodeSparkDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Scala code which is compiled at runtime.\nDefine a transform function which receives a DataObjectId, a DataFrame and a map of options and has to return a\nDataFrame. The scala code has to implement a function of type[[fnTransformType]] ." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "SparkRepartitionTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "numberOfTasksPerPartition": { + "type": "integer", + "description": "Number of Spark tasks to create per partition value by repartitioning the DataFrame." + }, + "keyCols": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional key columns to distribute records over Spark tasks inside a partition value." + } + }, + "title": "SparkRepartitionTransformer", + "required": [ + "numberOfTasksPerPartition", + "type" + ], + "additionalProperties": false, + "description": "Repartition DataFrame\nFor detailled description about repartitioning DataFrames see also[[SparkRepartitionDef]]" + }, + { + "type": "object", + "properties": { + "type": { + "const": "PythonCodeDfTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Optional python code to user for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "file": { + "type": "string", + "description": "Optional file with python code to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "PythonCodeDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) as Python/PySpark code.\nNote that this transformer needs a Python and PySpark environment installed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + { + "type": "object", + "properties": { + "type": { + "const": "OptionsGenericDfTransformer" + } + }, + "title": "OptionsGenericDfTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Interface to implement GenericDataFrame transformers working with one input and one output (1:1) and options.\nThis trait extends GenericDfTransformerDef to pass a map of options as parameter to the transform function.\nThis is mainly used by custom transformers." 
+ }, + { + "type": "object", + "properties": { + "type": { + "const": "FilterTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "filterClause": { + "type": "string", + "description": "Spark SQL expression to filter the DataFrame" + }, + "subFeedTypeForValidation": { + "type": "string", + "description": "When parsing the configuration the runtime subFeedType for validating the filter expression is not yet known.\nBy default SparkSubFeed langauge is used, but you can configure a different one if needed." + } + }, + "title": "FilterTransformer", + "required": [ + "filterClause", + "type" + ], + "additionalProperties": false, + "description": "Apply a filter condition to a DataFrame." + }, + { + "type": "object", + "properties": { + "type": { + "const": "AdditionalColumnsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "additionalColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]]and added to the DataFrame as literal columns.\n[[DefaultExpressionData]] contains informations from the context of the SDLB job, like runId or feed name." + }, + "additionalDerivedColumns": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [column name, spark sql expression] to be added as additional columns to the dataframe.\nThe spark sql expressions are evaluated against the input DataFrame and added to the DataFrame as derived columns." + } + }, + "title": "AdditionalColumnsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Add additional columns to the DataFrame by extracting information from the context or derived from input columns." + }, + { + "type": "object", + "properties": { + "type": { + "const": "WhitelistTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "columnWhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of columns to keep from DataFrame" + } + }, + "title": "WhitelistTransformer", + "required": [ + "columnWhitelist", + "type" + ], + "additionalProperties": false, + "description": "Apply a column whitelist to a DataFrame." + } + ], + "description": "Configuration for a GenericDfTransformerDef to be applied" + }, + "subFeedsToApply": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Names of SubFeeds the transformation should be applied to." + } + }, + "title": "DfTransformerWrapperDfsTransformer", + "required": [ + "type", + "transformer", + "subFeedsToApply" + ], + "additionalProperties": false, + "description": "A Transformer to use single DataFrame Transformers as multiple DataFrame Transformers.\nThis works by selecting the SubFeeds (DataFrames) the single DataFrame Transformer should be applied to.\nAll other SubFeeds will be passed through without transformation." 
+ }, + "PythonCodeDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "PythonCodeDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Optional python code to user for python transformation. The python code can use variables inputDfs and options. The transformed DataFrame has to be set with setOutputDfs." + }, + "file": { + "type": "string", + "description": "Optional file with python code to use for python transformation. The python code can use variables inputDfs and options. The transformed DataFrames has to be set with setOutputDfs." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "PythonCodeDfsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m) as Python/PySpark code.\nNote that this transformer needs a Python and PySpark environment installed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDfs`: Input DataFrames\n-`options`: Transformation options as Map[String,String]\nOutput DataFrames must be set with`setOutputDfs(dict)` ." + }, + "SQLDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "SQLDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Map of output names and corresponding SQL code for transformation.\nIf this is the last transformation in the chain, the output name has to match an output DataObject id,\notherwise it can be any name which will then be available in the next transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"\nThe special token ${inputViewName_} can be used to insert the name of temporary views.\nThe input name is either the id of an input DataObject, or the name of an output of the previous transformation\nif this is not the first transformation of the chain." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "SQLDfsTransformer", + "required": [ + "type", + "code" + ], + "additionalProperties": false, + "description": "Configuration of a custom GenericDataFrame transformation between many inputs and many outputs (n:m) as SQL code.\nThe input data is available as temporary views in SQL. As name for the temporary views the input DataObjectId is used\n(special characters are replaces by underscores).\nThe input data is available as temporary view in SQL. The input name is either an id of the input DataObject,\nor the name of an output of the previous transformation if this is not the first transformation of the chain.\nAlso note that to create the name of temporary view, special characters are replaced by underscores and a postfix \\\"_sdltemp\\\" is added.\nIt is therefore recommended to use the special token ${inputViewName_}, that will be replaced with the name\nof the temporary view at runtime.\n\nNote that you can access arbitrary tables from the metastore in the SQL code, but this is against the principle of SDLB\nto access data through DataObjects. Accessing tables directly in SQL code has a negative impact on the maintainability of the project." + }, + "ScalaClassGenericDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassGenericDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomGenericDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassGenericDfsTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m)\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and as\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomGenericDfsTransformer]] ." + }, + "ScalaClassSnowparkDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSnowparkDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomSnowparkDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "ScalaClassSnowparkDfsTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m)\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and as\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomSnowparkDfsTransformer]] ." + }, + "ScalaClassSparkDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "class name implementing trait[[CustomDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaClassSparkDfsTransformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m)\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and as\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomDfsTransformer]] ." + }, + "ScalaClassSparkDsNTo1Transformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaClassSparkDsNTo1Transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "className": { + "type": "string", + "description": "Class name implementing trait[[CustomDfsTransformer]]" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + }, + "parameterResolution": { + "type": "string", + "description": "By default parameter resolution for transform function uses input Datasets id to match the corresponding parameter name.\nBut there are other options, see[[ParameterResolution]] .", + "enum": [ + "DataObjectOrdering ", + "DataObjectId " + ] + }, + "strictInputValidation": { + "type": "boolean", + "description": "Enforce that the number of input dataobjects must be the same as the number of input datasets. False by default,\nbecause when chaining multiple transformations in the same action, you may not need all output Data objects of the previous transformations.\nHowever, having more input parameters in your transform method than Dataobjects will always fail." 
+ }, + "inputColumnAutoSelect": { + "type": "boolean", + "description": "Determine if the input-datasets should contain exactly the columns defined by the corresponding case class (spark does not ensure this out of the box). True per default." + }, + "outputColumnAutoSelect": { + "type": "boolean", + "description": "Determine if the output-dataset should contain exactly the columns defined by the corresponding case class (spark does not ensure this out of the box). True per default." + }, + "addPartitionValuesToOutput": { + "type": "boolean", + "description": "If set to true and if one partition-value is processed at a time, the partition-columns will be added to the output-dataset\nIf more than one partition-value is processed simultaneously, the transformation will fail because it cannot\ndetermine which row should get which partition-value. False by default." + }, + "outputDatasetId": { + "type": "string", + "description": "Optional id of the output Dataset. Default is the id of the Actions first output DataObject." + } + }, + "title": "ScalaClassSparkDsNTo1Transformer", + "required": [ + "type", + "className" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-Dataset transformation between N inputs and 1 outputs (N:1) as Java/Scala Class\nDefine a transform function that receives a SparkSession, a map of options and as many DataSets as you want, and that has to return one Dataset.\nThe Java/Scala class has to implement interface[[CustomDsNto1Transformer]] ." + }, + "ScalaCodeSparkDfsTransformer": { + "type": "object", + "properties": { + "type": { + "const": "ScalaCodeSparkDfsTransformer" + }, + "name": { + "type": "string", + "description": "name of the transformer" + }, + "description": { + "type": "string", + "description": "Optional description of the transformer" + }, + "code": { + "type": "string", + "description": "Scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "file": { + "type": "string", + "description": "File where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "ScalaCodeSparkDfsTransformer", + "required": [ + "type" + ], + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m) as Scala code which is compiled at runtime.\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and has\nto return a map of output DataObjectIds with DataFrames. The scala code has to implement a function of type[[fnTransformType]] ." 
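Editorial note: as a rough illustration of the SQLDfsTransformer defined above, a sketch of a transformers entry follows. The output name "btl-stats" and the runtimeOptions entry are hypothetical; the SQL reuses the %{runId} token example given in the schema description, and runtimeOptions values are Spark SQL expressions evaluated against DefaultExpressionData as described above.

  transformers = [{
    type = SQLDfsTransformer
    code = {
      # output name -> SQL; if this is the last transformer in the chain, the name must match an output DataObject id
      btl-stats = "select * from test where run = %{runId}"
    }
    # hypothetical: expose the job's runId to the %{runId} token in the SQL code
    runtimeOptions = { runId = "runId" }
  }]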
+ } + }, + "Action": { + "CopyAction": { + "type": "object", + "properties": { + "type": { + "const": "CopyAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "deleteDataAfterRead": { + "type": "boolean", + "description": "a flag to enable deletion of input partitions after copying." + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "string", + "description": "Optional SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "CustomDfTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1)\nDefine a transform function which receives a DataObjectIds, a DataFrames and a map of options and has to return a\nDataFrame, see also[[CustomDfTransformer]].\n\nNote about Python transformation: Environment with Python and PySpark needed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." 
+ }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfTransformer/AdditionalColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/BlacklistTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DataValidationTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DecryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/EncryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/FilterTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/PythonCodeDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SQLDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassGenericDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSnowparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaCodeSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaNotebookSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SparkRepartitionTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeColNamesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeSparkDatatypesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/WhitelistTransformer" + } + ] + }, + "description": "optional list of transformations to apply. See[[spark.transformer]] for a list of included Transformers.\nThe transformations are applied according to the lists ordering." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." + }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. 
The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "saveModeOptions": { + "oneOf": [ + { + "$ref": "#/definitions/SaveModeOptions/SaveModeGenericOptions" + }, + { + "$ref": "#/definitions/SaveModeOptions/SaveModeMergeOptions" + } + ], + "description": "override and parametrize saveMode set in output DataObject configurations when writing to DataObjects." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + }, + "agentId": { + "type": "string" + } + }, + "title": "CopyAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "This[[Action]] copies data between an input and output DataObject using DataFrames.\nThe input DataObject reads the data and converts it to a DataFrame according to its definition.\nThe DataFrame might be transformed using SQL or DataFrame transformations.\nThen the output DataObjects writes the DataFrame to the output according to its definition." + }, + "CustomDataFrameAction": { + "type": "object", + "properties": { + "type": { + "const": "CustomDataFrameAction" + }, + "inputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "input DataObject\\'s" + }, + "outputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output DataObject\\'s" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfsTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional map of output DataObject id and corresponding SQL Code.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." 
+ } + }, + "title": "CustomDfsTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between many inputs and many outputs (n:m).\nDefine a transform function which receives a map of input DataObjectIds with DataFrames and a map of options and has\nto return a map of output DataObjectIds with DataFrames, see also trait[[CustomDfsTransformer]] ." + }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfsTransformer/DfTransformerWrapperDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/PythonCodeDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/SQLDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassGenericDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassSnowparkDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassSparkDfsTransformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaClassSparkDsNTo1Transformer" + }, + { + "$ref": "#/definitions/GenericDfsTransformer/ScalaCodeSparkDfsTransformer" + } + ] + }, + "description": "list of transformations to apply. See[[spark.transformer]] for a list of included Transformers.\nThe transformations are applied according to the ordering of the list.\nNote that all outputs of previous transformers are kept as input for next transformer,\nbut in the end only outputs of the last transformer are mapped to output DataObjects." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." + }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "mainInputId": { + "type": "string", + "description": "optional selection of main inputId used for execution mode and partition values propagation. Only needed if there are multiple input DataObject\\'s." + }, + "mainOutputId": { + "type": "string", + "description": "optional selection of main outputId used for execution mode and partition values propagation. Only needed if there are multiple output DataObject\\'s." 
+ }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + }, + "recursiveInputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output of action that are used as input in the same action" + }, + "inputIdsToIgnoreFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional list of input ids to ignore filter (partition values & filter clause)" + } + }, + "title": "CustomDataFrameAction", + "required": [ + "type", + "inputIds", + "outputIds" + ], + "additionalProperties": false, + "description": "This[[Action]] transforms data between many input and output DataObjects using DataFrames.\nCustomDataFrameAction allows to define transformations between n input DataObjects and m output DataObjects,\nbut is is recommended to implement n:1 or 1:m transformations, as otherwise dependencies between DataObjects might not be accurate anymore.\nThe input DataFrames might be transformed using SQL or DataFrame transformations.\nWhen chaining multiple transformers, output DataFrames of previous transformers are available as input DataFrames for later transformers by their corresponding name." 
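Editorial note: a minimal CustomDataFrameAction sketched from the required properties above (type, inputIds, outputIds). The action id, DataObject ids and SQL text are hypothetical; the temporary view name follows the naming rule quoted above (special characters replaced by underscores, postfix "_sdltemp"), which should be double-checked against the SQLDfsTransformer notes before use.

  actions {
    join-departures-airports {                            # hypothetical action id
      type = CustomDataFrameAction
      inputIds = ["stg-departures", "stg-airports"]       # hypothetical input DataObject ids
      outputIds = ["btl-departures-airports"]             # hypothetical output DataObject id
      transformers = [{
        type = SQLDfsTransformer
        code = {
          # hypothetical SQL against the temporary view of the "stg-departures" input
          btl-departures-airports = "select * from stg_departures_sdltemp"
        }
      }]
    }
  }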
+ }, + "CustomFileAction": { + "type": "object", + "properties": { + "type": { + "const": "CustomFileAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name to load transformer code from" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from" + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + } + }, + "title": "CustomFileTransformerConfig", + "additionalProperties": false, + "description": "Configuration of custom file transformation between one input and one output (1:1)" + }, + "filesPerPartition": { + "type": "integer", + "description": "number of files per Spark partition" + }, + "breakFileRefLineage": { + "type": "boolean", + "description": "Stop propagating input FileRefs through action and instead get new FileRefs from DataObject according to the SubFeed\\'s partitionValue.\nThis is needed to reprocess all files of a path/partition instead of the FileRef\\'s passed from the previous Action." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." 
+ }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "CustomFileAction", + "required": [ + "type", + "inputId", + "outputId", + "transformer" + ], + "additionalProperties": false, + "description": "[[Action]] to transform files between two Hadoop Data Objects.\nThe transformation is executed in distributed mode on the Spark executors.\nA custom file transformer must be given, which reads a file from Hadoop and writes it back to Hadoop." + }, + "CustomScriptAction": { + "type": "object", + "properties": { + "type": { + "const": "CustomScriptAction" + }, + "inputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "input DataObject\\'s" + }, + "outputIds": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output DataObject\\'s" + }, + "scripts": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/ParsableScriptDef/CmdScript" + }, + { + "$ref": "#/definitions/ParsableScriptDef/DockerRunScript" + } + ] + }, + "description": "definition of scripts to execute" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "CustomScriptAction", + "required": [ + "type", + "inputIds", + "outputIds" + ], + "additionalProperties": false, + "description": "[[Action]] execute script after multiple input DataObjects are ready, notifying multiple output DataObjects when script succeeded.\n\nNote that this action can also be used to give your data pipeline additional structure, e.g. adding a decision point after several actions have been executed." + }, + "DeduplicateAction": { + "type": "object", + "properties": { + "type": { + "const": "DeduplicateAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "string", + "description": "Optional SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." 
+ }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "CustomDfTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1)\nDefine a transform function which receives a DataObjectIds, a DataFrames and a map of options and has to return a\nDataFrame, see also[[CustomDfTransformer]].\n\nNote about Python transformation: Environment with Python and PySpark needed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfTransformer/AdditionalColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/BlacklistTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DataValidationTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DecryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/EncryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/FilterTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/PythonCodeDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SQLDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassGenericDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSnowparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaCodeSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaNotebookSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SparkRepartitionTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeColNamesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeSparkDatatypesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/WhitelistTransformer" + } + ] + }, + "description": "optional list of transformations to apply before deduplication. See[[sparktransformer]] for a list of included Transformers.\nThe transformations are applied according to the lists ordering." 
+ }, + "ignoreOldDeletedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns in Schema Evolution" + }, + "ignoreOldDeletedNestedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns from nested data types in Schema Evolution.\nKeeping deleted columns in complex data types has performance impact as all new data\nin the future has to be converted by a complex function." + }, + "updateCapturedColumnOnlyWhenChanged": { + "type": "boolean", + "description": "Set to true to enable update Column[[TechnicalTableColumn.captured]] only if Record has changed in the source, instead of updating it with every execution (default=false).\nThis results in much less records updated with saveMode.Merge." + }, + "mergeModeEnable": { + "type": "boolean", + "description": "Set to true to use saveMode.Merge for much better performance. Output DataObject must implement[[CanMergeDataFrame]] if enabled (default = false)." + }, + "mergeModeAdditionalJoinPredicate": { + "type": "string", + "description": "To optimize performance it might be interesting to limit the records read from the existing table data, e.g. it might be sufficient to use only the last 7 days.\nSpecify a condition to select existing data to be used in transformation as Spark SQL expression.\nUse table alias \\'existing\\' to reference columns of the existing table data." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." + }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." 
+ } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "DeduplicateAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "This[[Action]]copies and deduplicates data between an input and output DataObject using DataFrames.\nDeduplication keeps the last record for every key, also after it has been deleted in the source.\nThe DataFrame might be transformed using SQL or DataFrame transformations. These transformations are applied before the deduplication.\n\nDeduplicateAction adds an additional Column[[TechnicalTableColumn.captured]]. It contains the timestamp of the last occurrence of the record in the source.\nThis creates lots of updates. Especially when using saveMode.Merge it is better to set[[TechnicalTableColumn.captured]]to the last change of the record in the source. Use updateCapturedColumnOnlyWhenChanged = true to enable this optimization.\n\nDeduplicateAction needs a transactional table (e.g.[[TransactionalTableDataObject]]) as output with defined primary keys.\nIf output implements[[CanMergeDataFrame]] , saveMode.Merge can be enabled by setting mergeModeEnable = true. This allows for much better performance." + }, + "FileTransferAction": { + "type": "object", + "properties": { + "type": { + "const": "FileTransferAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "overwrite": { + "type": "boolean", + "description": "Allow existing output file to be overwritten. If false the action will fail if a file to be created already exists. Default is true." + }, + "maxParallelism": { + "type": "integer", + "description": "Set maximum of files to be transferred in parallel.\nNote that this information can also be set on DataObjects like SFtpFileRefDataObject, resp. its SFtpFileRefConnection.\nThe FileTransferAction will then take the minimum parallelism of input, output and this attribute.\nIf parallelism is not specified on input, output and this attribute, it is set to 1." + }, + "filenameExtractorRegex": { + "type": "string", + "description": "A regex to extract a part of the filename to keep in the translated FileRef.\nIf the regex contains group definitions, the first group is taken, otherwise the whole regex match.\nDefault is None which keeps the whole filename (without path)." + }, + "breakFileRefLineage": { + "type": "boolean", + "description": "If set to true, file references passed on from previous action are ignored by this action.\nThe action will detect on its own what files it is going to process." 
+ }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "FileTransferAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "[[Action]] to transfer files between SFtp, Hadoop and local Fs." + }, + "HistorizeAction": { + "type": "object", + "properties": { + "type": { + "const": "HistorizeAction" + }, + "inputId": { + "type": "string", + "description": "inputs DataObject" + }, + "outputId": { + "type": "string", + "description": "output DataObject" + }, + "transformer": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "Optional class name implementing trait[[CustomDfTransformer]]" + }, + "scalaFile": { + "type": "string", + "description": "Optional file where scala code for transformation is loaded from. The scala code in the file needs to be a function of type[[fnTransformType]] ." + }, + "scalaCode": { + "type": "string", + "description": "Optional scala code for transformation. The scala code needs to be a function of type[[fnTransformType]] ." + }, + "sqlCode": { + "type": "string", + "description": "Optional SQL code for transformation.\nUse tokens %{} to replace with runtimeOptions in SQL code.\nExample: \\\"select * from test where run = %{runId}\\\"" + }, + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." + }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python transformation. The python code can use variables inputDf, dataObjectId and options. The transformed DataFrame has to be set with setOutputDf." 
+ }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options to pass to the transformation" + }, + "runtimeOptions": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "optional tuples of [key, spark sql expression] to be added as additional options when executing transformation.\nThe spark sql expressions are evaluated against an instance of[[DefaultExpressionData]] ." + } + }, + "title": "CustomDfTransformerConfig", + "additionalProperties": false, + "description": "Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1)\nDefine a transform function which receives a DataObjectIds, a DataFrames and a map of options and has to return a\nDataFrame, see also[[CustomDfTransformer]].\n\nNote about Python transformation: Environment with Python and PySpark needed.\nPySpark session is initialize and available under variables`sc`,`session`,`sqlContext`.\nOther variables available are\n-`inputDf`: Input DataFrame\n-`options`: Transformation options as Map[String,String]\n-`dataObjectId`: Id of input dataObject as String\nOutput DataFrame must be set with`setOutputDf(df)` ." + }, + "transformers": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/GenericDfTransformer/AdditionalColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/BlacklistTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DataValidationTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/DecryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/EncryptColumnsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/FilterTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/PythonCodeDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SQLDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassGenericDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSnowparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaClassSparkDsTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaCodeSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/ScalaNotebookSparkDfTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/SparkRepartitionTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeColNamesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/StandardizeSparkDatatypesTransformer" + }, + { + "$ref": "#/definitions/GenericDfTransformer/WhitelistTransformer" + } + ] + }, + "description": "optional list of transformations to apply before historization. See[[sparktransformer]] for a list of included Transformers.\nThe transformations are applied according to the lists ordering." + }, + "filterClause": { + "type": "string", + "description": "Filter of data to be processed by historization. It can be used to exclude historical data not needed to create new history, for performance reasons.\nNote that filterClause is only applied if mergeModeEnable=false. Use mergeModeAdditionalJoinPredicate if mergeModeEnable=true to achieve a similar performance tuning." 
+ }, + "historizeBlacklist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional list of columns to ignore when comparing two records in historization. Can not be used together with[[historizeWhitelist]] ." + }, + "historizeWhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "optional final list of columns to use when comparing two records in historization. Can not be used together with[[historizeBlacklist]] ." + }, + "ignoreOldDeletedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns in Schema Evolution" + }, + "ignoreOldDeletedNestedColumns": { + "type": "boolean", + "description": "if true, remove no longer existing columns from nested data types in Schema Evolution.\nKeeping deleted columns in complex data types has performance impact as all new data\nin the future has to be converted by a complex function." + }, + "mergeModeEnable": { + "type": "boolean", + "description": "Set to true to use saveMode.Merge for much better performance by using incremental historization.\nOutput DataObject must implement[[CanMergeDataFrame]] if enabled (default = false).\nIncremental historization will add an additional \\\"dl_hash\\\" column which is used for change detection between\nexisting and new data.\nNote that enabling mergeMode on an existing HistorizeAction will create a new version for every\nnew record in the output table, as \\\"dl_hash\\\" column is initially null." + }, + "mergeModeAdditionalJoinPredicate": { + "type": "string", + "description": "To optimize performance it might be interesting to limit the records read from the existing table data, e.g. it might be sufficient to use only the last 7 days.\nSpecify a condition to select existing data to be used in transformation as Spark SQL expression.\nUse table alias \\'existing\\' to reference columns of the existing table data." + }, + "mergeModeCDCColumn": { + "type": "string", + "description": "Optional colum holding the CDC operation to replay to enable mergeModeCDC. If CDC information is available from the source\nincremental historization can be further optimized, as the join with existing data can be omitted.\nNote that this should be enabled only, if input data contains just inserted, updated and deleted records.\nHistorizeAction in mergeModeCDC will make *no* change detection on its own, and create a new version for every inserted/updated record it receives!\nYou will also need to specify parameter mergeModeCDCDeletedValue to use this and mergeModeEnable=true.\nIncrement CDC historization will add an additional column \\\"dl_dummy\\\" to the target table,\nwhich is used to work around limitations of SQL merge statement, but \\\"dl_hash\\\" column from mergeMode is no longer needed." + }, + "mergeModeCDCDeletedValue": { + "type": "string", + "description": "Optional value of mergeModeCDCColumn that marks a record as deleted." + }, + "breakDataFrameLineage": { + "type": "boolean", + "description": "Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject.\nThis can help to save memory and performance if the input DataFrame includes many transformations from previous Actions.\nThe new DataFrame will be initialized according to the SubFeed\\'s partitionValues." 
+ }, + "persist": { + "type": "boolean", + "description": "Force persisting input DataFrame\\'s on Disk.\nThis improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point\nin case a task get\\'s lost.\nNote that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this\nbehaviour set breakDataFrameLineage=false." + }, + "executionMode": { + "oneOf": [ + { + "$ref": "#/definitions/ExecutionMode/CustomMode" + }, + { + "$ref": "#/definitions/ExecutionMode/CustomPartitionMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataFrameIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/DataObjectStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FailIfNoPartitionValuesMode" + }, + { + "$ref": "#/definitions/ExecutionMode/FileIncrementalMoveMode" + }, + { + "$ref": "#/definitions/ExecutionMode/KafkaStateIncrementalMode" + }, + { + "$ref": "#/definitions/ExecutionMode/PartitionDiffMode" + }, + { + "$ref": "#/definitions/ExecutionMode/ProcessAllMode" + }, + { + "$ref": "#/definitions/ExecutionMode/SparkStreamingMode" + } + ], + "description": "optional execution mode for this Action" + }, + "executionCondition": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Condition formulated as Spark SQL. The attributes available are dependent on the context." + }, + "description": { + "type": "string", + "description": "A textual description of the condition to be shown in error messages." + } + }, + "title": "Condition", + "required": [ + "expression" + ], + "additionalProperties": false, + "description": "Definition of a Spark SQL condition with description.\nThis is used for example to define failConditions of[[PartitionDiffMode]] ." + }, + "metricsFailCondition": { + "type": "string", + "description": "optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value.\nIf there are any rows passing the where clause, a MetricCheckFailed exception is thrown." + }, + "metadata": { + "$ref": "#/definitions/Others/ActionMetadata" + } + }, + "title": "HistorizeAction", + "required": [ + "type", + "inputId", + "outputId" + ], + "additionalProperties": false, + "description": "This[[Action]]historizes data between an input and output DataObject using DataFrames.\nHistorization creates a technical history of data by creating valid-from/to columns.\nThe DataFrame might be transformed using SQL or DataFrame transformations. These transformations are applied before the deduplication.\n\nHistorizeAction needs a transactional table (e.g. implementation of[[TransactionalTableDataObject]]) as output with defined primary keys.\n\nNormal historization join new with all existing data, and rewrites all data in output table. This is not optimal from\na performance perspective.\nIt can be optimized if output object supports[[CanMergeDataFrame]] . In that case you can\nset mergeModeEnable=true to use incremental historization, which does not rewrite all data in output table. It still needs to\njoin new data with all existing data, but uses hash values to minimize data transfer.\nIf you have change-data-capture (CDC) information available to identify deleted records, you can set\nmergeModeCDCColumn and mergeModeCDCDeletedValue to even avoid the join between new and existing data. This is optimal from\na performance perspective." 
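Editorial note: a compact HistorizeAction sketch based on the properties above; all ids are hypothetical. mergeModeEnable is included only to show the optimization discussed in the description, which requires an output DataObject implementing CanMergeDataFrame and defined primary keys.

  actions {
    historize-airports {            # hypothetical action id
      type = HistorizeAction
      inputId = "stg-airports"      # hypothetical input DataObject id
      outputId = "hist-airports"    # hypothetical transactional table DataObject with primary key
      mergeModeEnable = true        # optional; enables incremental historization via saveMode.Merge
    }
  }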
+ }, + "ProxyAction": { + "type": "object", + "properties": { + "type": { + "const": "ProxyAction" + }, + "wrappedAction": { + "oneOf": [ + { + "$ref": "#/definitions/Action/CustomFileAction" + }, + { + "$ref": "#/definitions/Action/CustomScriptAction" + }, + { + "$ref": "#/definitions/Action/HistorizeAction" + } + ] + }, + "agent": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "AzureRelayAgent" + }, + "url": { + "type": "string" + }, + "connections": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "HiveTableConnection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "optional schema, authority and base path for tables directory on hadoop." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HiveTableConnection", + "required": [ + "db", + "type" + ], + "additionalProperties": false, + "description": "Connection information for hive tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableConnection" + }, + "catalog": { + "type": "string", + "description": "optional catalog to be used for this connection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for tables directory on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "checkDeltaLakeSparkOptions": { + "type": "boolean" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "DeltaLakeTableConnection", + "required": [ + "db", + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for DeltaLake tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefConnection" + }, + "host": { + "type": "string", + "description": "sftp host" + }, + "port": { + "type": "integer", + "description": "port of sftp service, default is 22" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode and PublicKeyAuthMode are supported." + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "proxy host" + }, + "port": { + "type": "integer", + "description": "proxy port" + }, + "proxyType": { + "type": "string", + "description": "Type of proxy: HTTP or SOCKS. Default is HTTP.", + "enum": [ + "DIRECT", + "HTTP", + "SOCKS" + ] + } + }, + "title": "JavaNetProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false, + "description": "Proxy configuration to create java.net.Proxy instance." 
+ }, + "ignoreHostKeyVerification": { + "type": "boolean", + "description": "do not validate host key if true, default is false" + }, + "maxParallelConnections": { + "type": "integer", + "description": "number of parallel sftp connections created by an instance of this connection" + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SFtpFileRefConnection", + "required": [ + "host", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "SFTP Connection information" + }, + { + "type": "object", + "properties": { + "type": { + "const": "HadoopFileConnection" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for accessing files on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HadoopFileConnection", + "required": [ + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for files on hadoop" + }, + { + "type": "object", + "properties": { + "type": { + "const": "KafkaConnection" + }, + "brokers": { + "type": "string", + "description": "comma separated list of kafka bootstrap server incl. port, e.g. \\\"host1:9092,host2:9092:" + }, + "schemaRegistry": { + "type": "string", + "description": "url of schema registry service, e.g. 
\\\"https://host2\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html)" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "KafkaConnection", + "required": [ + "brokers", + "type" + ], + "additionalProperties": false, + "description": "Connection information for kafka" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeConnection" + }, + "url": { + "type": "string", + "description": "snowflake connection url" + }, + "warehouse": { + "type": "string", + "description": "Snowflake namespace" + }, + "database": { + "type": "string", + "description": "Snowflake database" + }, + "role": { + "type": "string", + "description": "Snowflake role" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SnowflakeConnection", + "required": [ + "url", + "warehouse", + "database", + "role", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for Snowflake databases.\nThe connection can be used for SnowflakeTableDataObjects\nIf multiple SnowflakeTableDataObjects share a connection, they share the same Snowpark session" + }, + { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableConnection" + }, + "url": { + "type": "string", + "description": "jdbc connection url" + }, + "driver": { + "type": "string", + "description": "class name of jdbc driver" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." 
+ }, + "db": { + "type": "string", + "description": "jdbc database" + }, + "maxParallelConnections": { + "type": "integer", + "description": "max number of parallel jdbc connections created by an instance of this connection, default is 3\nNote that Spark manages JDBC Connections on its own. This setting only applies to JDBC connection\nused by SDL for validating metadata or pre/postSQL." + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "connectionPoolMaxWaitTimeSec": { + "type": "integer", + "description": "timeout when waiting for connection in pool to become available. Default is 600 seconds (10 minutes)." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + }, + "autoCommit": { + "type": "boolean", + "description": "flag to enable or disable the auto-commit behaviour. When autoCommit is enabled, each database request is executed in its own transaction.\nDefault is autoCommit = false. It is not recommended to enable autoCommit as it will deactivate any transactional behaviour.", + "deprecated": true + }, + "connectionInitSql": { + "type": "string", + "description": "SQL statement to be executed every time a new connection is created, for example to set session parameters" + } + }, + "title": "JdbcTableConnection", + "required": [ + "url", + "driver", + "type" + ], + "additionalProperties": false, + "description": "Connection information for jdbc tables.\nIf authentication is needed, user and password must be provided." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SplunkConnection" + }, + "host": { + "type": "string", + "description": "" + }, + "port": { + "type": "integer", + "description": "" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SplunkConnection", + "required": [ + "host", + "port", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for splunk queries" + } + ] + } + } + }, + "title": "AzureRelayAgent", + "required": [ + "url", + "connections", + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "const": "JettyAgent" + }, + "url": { + "type": "string" + }, + "connections": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "const": "HiveTableConnection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "optional schema, authority and base path for tables directory on hadoop." + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HiveTableConnection", + "required": [ + "db", + "type" + ], + "additionalProperties": false, + "description": "Connection information for hive tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "DeltaLakeTableConnection" + }, + "catalog": { + "type": "string", + "description": "optional catalog to be used for this connection" + }, + "db": { + "type": "string", + "description": "hive db" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for tables directory on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. 
rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "checkDeltaLakeSparkOptions": { + "type": "boolean" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "DeltaLakeTableConnection", + "required": [ + "db", + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for DeltaLake tables" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SFtpFileRefConnection" + }, + "host": { + "type": "string", + "description": "sftp host" + }, + "port": { + "type": "integer", + "description": "port of sftp service, default is 22" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "authentication information: for now BasicAuthMode and PublicKeyAuthMode are supported." + }, + "proxy": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "proxy host" + }, + "port": { + "type": "integer", + "description": "proxy port" + }, + "proxyType": { + "type": "string", + "description": "Type of proxy: HTTP or SOCKS. Default is HTTP.", + "enum": [ + "DIRECT", + "HTTP", + "SOCKS" + ] + } + }, + "title": "JavaNetProxyConfig", + "required": [ + "host", + "port" + ], + "additionalProperties": false, + "description": "Proxy configuration to create java.net.Proxy instance." 
+ }, + "ignoreHostKeyVerification": { + "type": "boolean", + "description": "do not validate host key if true, default is false" + }, + "maxParallelConnections": { + "type": "integer", + "description": "number of parallel sftp connections created by an instance of this connection" + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SFtpFileRefConnection", + "required": [ + "host", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "SFTP Connection information" + }, + { + "type": "object", + "properties": { + "type": { + "const": "HadoopFileConnection" + }, + "pathPrefix": { + "type": "string", + "description": "schema, authority and base path for accessing files on hadoop" + }, + "acl": { + "type": "object", + "properties": { + "permission": { + "type": "string", + "description": ": File system permission string in symbolic notation form (e.g. rwxr-xr-x)" + }, + "acls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aclType": { + "type": "string", + "description": ": type of ACL to be added \\\"group\\\", \\\"user\\\"" + }, + "name": { + "type": "string", + "description": ": the name of the user/group for which an ACL definition is being added" + }, + "permission": { + "type": "string", + "description": ": the permission (rwx syntax) to be granted" + } + }, + "title": "AclElement", + "required": [ + "aclType", + "name", + "permission" + ], + "additionalProperties": false, + "description": "Describes a single extended ACL to be applied to a Data Object\nin addition to the basic file system permissions" + }, + "description": ": a sequence of[[AclElement]] s" + } + }, + "title": "AclDef", + "required": [ + "permission", + "acls" + ], + "additionalProperties": false, + "description": "Describes a complete ACL Specification (basic owner/group/other permissions AND extended ACLS)\nto be applied to a Data Object on writing" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "HadoopFileConnection", + "required": [ + "pathPrefix", + "type" + ], + "additionalProperties": false, + "description": "Connection information for files on hadoop" + }, + { + "type": "object", + "properties": { + "type": { + "const": "KafkaConnection" + }, + "brokers": { + "type": "string", + "description": "comma separated list of kafka bootstrap server incl. port, e.g. \\\"host1:9092,host2:9092:" + }, + "schemaRegistry": { + "type": "string", + "description": "url of schema registry service, e.g. 
\\\"https://host2\\\"" + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options for the Kafka stream reader (see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html)" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ] + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "KafkaConnection", + "required": [ + "brokers", + "type" + ], + "additionalProperties": false, + "description": "Connection information for kafka" + }, + { + "type": "object", + "properties": { + "type": { + "const": "SnowflakeConnection" + }, + "url": { + "type": "string", + "description": "snowflake connection url" + }, + "warehouse": { + "type": "string", + "description": "Snowflake namespace" + }, + "database": { + "type": "string", + "description": "Snowflake database" + }, + "role": { + "type": "string", + "description": "Snowflake role" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SnowflakeConnection", + "required": [ + "url", + "warehouse", + "database", + "role", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for Snowflake databases.\nThe connection can be used for SnowflakeTableDataObjects\nIf multiple SnowflakeTableDataObjects share a connection, they share the same Snowpark session" + }, + { + "type": "object", + "properties": { + "type": { + "const": "JdbcTableConnection" + }, + "url": { + "type": "string", + "description": "jdbc connection url" + }, + "driver": { + "type": "string", + "description": "class name of jdbc driver" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "optional authentication information: for now BasicAuthMode is supported." 
+ }, + "db": { + "type": "string", + "description": "jdbc database" + }, + "maxParallelConnections": { + "type": "integer", + "description": "max number of parallel jdbc connections created by an instance of this connection, default is 3\nNote that Spark manages JDBC Connections on its own. This setting only applies to JDBC connection\nused by SDL for validating metadata or pre/postSQL." + }, + "connectionPoolMaxIdleTimeSec": { + "type": "integer", + "description": "timeout to close unused connections in the pool" + }, + "connectionPoolMaxWaitTimeSec": { + "type": "integer", + "description": "timeout when waiting for connection in pool to become available. Default is 600 seconds (10 minutes)." + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + }, + "autoCommit": { + "type": "boolean", + "description": "flag to enable or disable the auto-commit behaviour. When autoCommit is enabled, each database request is executed in its own transaction.\nDefault is autoCommit = false. It is not recommended to enable autoCommit as it will deactivate any transactional behaviour.", + "deprecated": true + }, + "connectionInitSql": { + "type": "string", + "description": "SQL statement to be executed every time a new connection is created, for example to set session parameters" + } + }, + "title": "JdbcTableConnection", + "required": [ + "url", + "driver", + "type" + ], + "additionalProperties": false, + "description": "Connection information for jdbc tables.\nIf authentication is needed, user and password must be provided." + }, + { + "type": "object", + "properties": { + "type": { + "const": "SplunkConnection" + }, + "host": { + "type": "string", + "description": "" + }, + "port": { + "type": "integer", + "description": "" + }, + "authMode": { + "oneOf": [ + { + "$ref": "#/definitions/AuthMode/AuthHeaderMode" + }, + { + "$ref": "#/definitions/AuthMode/BasicAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/CustomHttpAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/KeycloakClientSecretAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/PublicKeyAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SASLSCRAMAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/SSLCertsAuthMode" + }, + { + "$ref": "#/definitions/AuthMode/TokenAuthMode" + } + ], + "description": "" + }, + "metadata": { + "$ref": "#/definitions/Others/ConnectionMetadata" + } + }, + "title": "SplunkConnection", + "required": [ + "host", + "port", + "authMode", + "type" + ], + "additionalProperties": false, + "description": "Connection information for splunk queries" + } + ] + } + } + }, + "title": "JettyAgent", + "required": [ + "url", + "connections", + "type" + ], + "additionalProperties": false + } + ] + } + }, + "title": "ProxyAction", + "required": [ + "type", + "wrappedAction", + "agent" + ], + "additionalProperties": false, + "description": "Allows to execute the action defined by\n\nOTHERTAG: on a remote agent defined by\n\nOTHERTAG: .\nIf the execution of\n\nOTHERTAG: is successful, the ProxyAction will return an empty SparkSubFeed by the correct schema." 
+ } + } + }, + "properties": { + "global": { + "type": "object", + "properties": { + "kryoClasses": { + "type": "array", + "items": { + "type": "string" + }, + "description": "classes to register for spark kryo serialization" + }, + "sparkOptions": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "spark options\n\nThe value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "statusInfo": { + "type": "object", + "properties": { + "port": { + "type": "integer", + "description": ": port with which the first connection attempt is made" + }, + "maxPortRetries": { + "type": "integer", + "description": ": If port is already in use, we will increment port by one and try with that new port.\nmaxPortRetries describes how many times this should be attempted. If set to 0 it will not be attempted.\nValues below 0 are not allowed." + }, + "stopOnEnd": { + "type": "boolean", + "description": ": Set to false if the Server should remain online even after SDL has finished its execution.\nIn that case, the Application needs to be stopped manually. Useful for debugging." + } + }, + "title": "StatusInfoConfig", + "additionalProperties": false, + "description": "Configuration for the Server that provides live status info of the current DAG Execution" + }, + "enableHive": { + "type": "boolean", + "description": "enable hive for spark session" + }, + "memoryLogTimer": { + "type": "object", + "properties": { + "intervalSec": { + "type": "integer", + "description": "interval in seconds between memory usage logs" + }, + "logLinuxMem": { + "type": "boolean", + "description": "enable logging linux memory" + }, + "logLinuxCGroupMem": { + "type": "boolean", + "description": "enable logging details about linux cgroup memory" + }, + "logBuffers": { + "type": "boolean", + "description": "enable logging details about different jvm buffers" + } + }, + "title": "MemoryLogTimerConfig", + "required": [ + "intervalSec" + ], + "additionalProperties": false, + "description": "Configuration for periodic memory usage logging" + }, + "shutdownHookLogger": { + "type": "boolean", + "description": "enable shutdown hook logger to trace shutdown cause" + }, + "stateListeners": { + "type": "array", + "items": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "fully qualified class name of class implementing StateListener interface. The class needs a constructor with one parameter`options: Map[String,String]` ." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "Options are passed to StateListener constructor.\n\nThe value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." 
+ } + }, + "title": "StateListenerConfig", + "required": [ + "className" + ], + "additionalProperties": false, + "description": "Configuration to notify interested parties about action results & metric" + }, + "description": "Define state listeners to be registered for receiving events of the execution of SmartDataLake job" + }, + "sparkUDFs": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "fully qualified class name of class implementing SparkUDFCreator interface. The class needs a constructor without parameters." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options are passed to SparkUDFCreator apply method." + } + }, + "title": "SparkUDFCreatorConfig", + "required": [ + "className" + ], + "additionalProperties": false, + "description": "Configuration to register a UserDefinedFunction in the spark session of SmartDataLake." + }, + "description": "Define UDFs to be registered in spark session. The registered UDFs are available in Spark SQL transformations\nand expression evaluation, e.g. configuration of ExecutionModes." + }, + "pythonUDFs": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "pythonFile": { + "type": "string", + "description": "Optional pythonFile to use for python UDF." + }, + "pythonCode": { + "type": "string", + "description": "Optional pythonCode to use for python UDF." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options are available in your python code as variable options." + } + }, + "title": "PythonUDFCreatorConfig", + "additionalProperties": false, + "description": "Configuration to register a Python UDF in the spark session of SmartDataLake.\nDefine a python function with type hints i python code and register it in global configuration.\nThe name of the function must match the name you use to declare it in GlobalConf.\nThe Python function can then be used in Spark SQL expressions." + }, + "description": "Define UDFs in python to be registered in spark session. The registered UDFs are available in Spark SQL transformations\nbut not for expression evaluation." + }, + "secretProviders": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "className": { + "type": "string", + "description": "fully qualified class name of class implementing SecretProvider interface. The class needs a constructor with parameter \\\"options: Map[String,String]\\\"." + }, + "options": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Options are passed to SecretProvider apply method." + } + }, + "title": "SecretProviderConfig", + "required": [ + "className" + ], + "additionalProperties": false, + "description": "Configuration to register a SecretProvider." + }, + "description": "Define SecretProvider\\'s to be registered." + }, + "allowOverwriteAllPartitionsWithoutPartitionValues": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Configure a list of exceptions for partitioned DataObject id\\'s,\nwhich are allowed to overwrite the all partitions of a table if no partition values are set.\nThis is used to override/avoid a protective error when using SDLSaveMode.OverwriteOptimized|OverwritePreserveDirectories.\nDefine it as a list of DataObject id\\'s." 
+ }, + "allowAsRecursiveInput": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of DataObjects for which the validation rules for Action.recursiveInputIds are *not* checked.\nThe validation rules are\n1) that recursive input DataObjects must also be listed in output DataObjects of the same action\n2) the DataObject must implement TransactionalSparkTableDataObject interface\nListing a DataObject in allowAsRecursiveInput can be used for well thought exceptions, but should be avoided in general.\nNote that if 1) is true, also 2) must be fullfilled for Spark to work properly (because Spark can\\'t read/write the same storage location in the same job),\nbut there might be cases with recursions with different Actions involved, that dont need to fullfill 2)." + }, + "synchronousStreamingTriggerIntervalSec": { + "type": "integer", + "description": "Trigger interval for synchronous actions in streaming mode in seconds (default = 60 seconds)\nThe synchronous actions of the DAG will be executed with this interval if possile.\nNote that for asynchronous actions there are separate settings, e.g. SparkStreamingMode.triggerInterval." + }, + "environment": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Override environment settings defined in Environment object by setting the corresponding key to the desired value (key in camelcase notation with the first letter in lowercase)" + }, + "pluginOptions": { + "type": "object", + "additionalProperties": { + "type": "string", + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + }, + "description": "The value can be provided in plaintext or with the id of a secret provider and the\nname of the secret in the format\n```#######```,\ne.g.\n```###ENV####```\nto get a secret from an environment variable." + } + }, + "title": "GlobalConfig", + "additionalProperties": false, + "description": "Global configuration options\n\nNote that global configuration is responsible to hold SparkSession, so that its created once and only once per SDLB job.\nThis is especially important if JVM is shared between different SDL jobs (e.g. Databricks cluster), because sharing SparkSession in object Environment survives the current SDLB job." 
+ }, + "connections": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/definitions/Connection/DeltaLakeTableConnection" + }, + { + "$ref": "#/definitions/Connection/HadoopFileConnection" + }, + { + "$ref": "#/definitions/Connection/HiveTableConnection" + }, + { + "$ref": "#/definitions/Connection/JdbcTableConnection" + }, + { + "$ref": "#/definitions/Connection/KafkaConnection" + }, + { + "$ref": "#/definitions/Connection/SFtpFileRefConnection" + }, + { + "$ref": "#/definitions/Connection/SnowflakeConnection" + }, + { + "$ref": "#/definitions/Connection/SplunkConnection" + } + ], + "description": "Map Connection name : definition" + } + }, + "dataObjects": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/definitions/DataObject/AccessTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/ActionsExporterDataObject" + }, + { + "$ref": "#/definitions/DataObject/AirbyteDataObject" + }, + { + "$ref": "#/definitions/DataObject/AvroFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/CsvFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/CustomDfDataObject" + }, + { + "$ref": "#/definitions/DataObject/CustomFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/DataObjectsExporterDataObject" + }, + { + "$ref": "#/definitions/DataObject/DeltaLakeTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/ExcelFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/HiveTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/JdbcTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/JmsDataObject" + }, + { + "$ref": "#/definitions/DataObject/JsonFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/KafkaTopicDataObject" + }, + { + "$ref": "#/definitions/DataObject/PKViolatorsDataObject" + }, + { + "$ref": "#/definitions/DataObject/ParquetFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/RawFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/RelaxedCsvFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/SFtpFileRefDataObject" + }, + { + "$ref": "#/definitions/DataObject/SnowflakeTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/SplunkDataObject" + }, + { + "$ref": "#/definitions/DataObject/TickTockHiveTableDataObject" + }, + { + "$ref": "#/definitions/DataObject/WebserviceFileDataObject" + }, + { + "$ref": "#/definitions/DataObject/XmlFileDataObject" + } + ], + "description": "Map of DataObject name and definition" + } + }, + "actions": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/definitions/Action/CopyAction" + }, + { + "$ref": "#/definitions/Action/CustomDataFrameAction" + }, + { + "$ref": "#/definitions/Action/CustomFileAction" + }, + { + "$ref": "#/definitions/Action/CustomScriptAction" + }, + { + "$ref": "#/definitions/Action/DeduplicateAction" + }, + { + "$ref": "#/definitions/Action/FileTransferAction" + }, + { + "$ref": "#/definitions/Action/HistorizeAction" + }, + { + "$ref": "#/definitions/Action/ProxyAction" + } + ], + "description": "Map of Action name and definition" + } + } + }, + "required": [ + "dataObjects", + "actions" + ], + "additionalProperties": true +} \ No newline at end of file diff --git a/src/test/resources/playground/basic.conf b/src/test/resources/playground/basic.conf index 4fc92db..b9028a9 100644 --- a/src/test/resources/playground/basic.conf +++ b/src/test/resources/playground/basic.conf @@ -4,5 +4,5 @@ dataObjects { } actions { - -} \ No newline at end of file + agentId = bla 
+} diff --git a/src/test/resources/playground/demo.conf b/src/test/resources/playground/demo.conf new file mode 100644 index 0000000..84eaef0 --- /dev/null +++ b/src/test/resources/playground/demo.conf @@ -0,0 +1,36 @@ +actions { + + join-departures-airports { + type = CustomDataFrameAction + + inputIds = [stg-departures, int-airports] + transformer = { + type = SQLDfsTransformer + code = { + btl-connected-airports = "select stg_departures.estdepartureairport, stg_departures.estarrivalairport, airports.* from stg_departures join int_airports airports on stg_departures.estArrivalAirport = airports.ident" + } + } + } + + compute-distances { + type = CopyAction + + code = { + btl-departures-arrivals-airports = "select btl_connected_airports.estdepartureairport, btl_connected_airports.estarrivalairport, btl_connected_airports.name as arr_name, btl_connected_airports.latitude_deg as arr_latitude_deg, btl_connected_airports.longitude_deg as arr_longitude_deg, airports.name as dep_name, airports.latitude_deg as dep_latitude_deg, airports.longitude_deg as dep_longitude_deg from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident" + } + metadata { + feed = compute + } + } + + download-airports { + + inputId = ext-airports + } + +} + +dataObjects { + + +} \ No newline at end of file diff --git a/src/test/scala/io/smartdatalake/completion/SDLBCompletionEngineSpec.scala b/src/test/scala/io/smartdatalake/completion/SDLBCompletionEngineSpec.scala new file mode 100644 index 0000000..6b11f69 --- /dev/null +++ b/src/test/scala/io/smartdatalake/completion/SDLBCompletionEngineSpec.scala @@ -0,0 +1,22 @@ +package io.smartdatalake.completion + +import io.smartdatalake.UnitSpec +import io.smartdatalake.context.SDLBContext +import ujson.* + +import scala.io.Source +import scala.util.Using + +class SDLBCompletionEngineSpec extends UnitSpec { + + val completionEngine = new SDLBCompletionEngineImpl + + "SDLB Completion engine" should "retrieve all the properties of copyAction" in { + val context = SDLBContext.createContext(loadFile("fixture/hocon/with-multi-lines-flattened-example.conf"), 16, 0) + println(context.parentPath + " " + context.parentWord) + println(completionEngine.generateCompletionItems(context)) + //TODO + } + + +} diff --git a/src/test/scala/io/smartdatalake/completion/schema/SchemaReaderSpec.scala b/src/test/scala/io/smartdatalake/completion/schema/SchemaReaderSpec.scala new file mode 100644 index 0000000..ff84fee --- /dev/null +++ b/src/test/scala/io/smartdatalake/completion/schema/SchemaReaderSpec.scala @@ -0,0 +1,67 @@ +package io.smartdatalake.completion.schema + +import io.smartdatalake.UnitSpec +import ujson.* + +import scala.io.Source +import scala.util.Using + +class SchemaReaderSpec extends UnitSpec { + + val schemaReader = new SchemaReaderImpl("fixture/sdl-schema/sdl-schema-2.5.0.json") + + "Schema Reader" should "retrieve all the properties of copyAction" in { + val actual = schemaReader.retrieveActionProperties("CopyAction").toList + val expected = List( + SchemaItem("type", ItemType.STRING, """"""), + SchemaItem("inputId", ItemType.STRING, """inputs DataObject"""), + SchemaItem("outputId", ItemType.STRING, """output DataObject"""), + SchemaItem("deleteDataAfterRead", ItemType.BOOLEAN, """a flag to enable deletion of input partitions after copying."""), + SchemaItem("transformer", ItemType.OBJECT, + """Configuration of a custom Spark-DataFrame transformation between one input and one output (1:1) + |Define a transform 
function which receives a DataObjectIds, a DataFrames and a map of options and has to return a + |DataFrame, see also[[CustomDfTransformer]]. + | + |Note about Python transformation: Environment with Python and PySpark needed. + |PySpark session is initialize and available under variables`sc`,`session`,`sqlContext`. + |Other variables available are + |-`inputDf`: Input DataFrame + |-`options`: Transformation options as Map[String,String] + |-`dataObjectId`: Id of input dataObject as String + |Output DataFrame must be set with`setOutputDf(df)` .""".stripMargin), + SchemaItem("transformers", ItemType.ARRAY, + """optional list of transformations to apply. See[[spark.transformer]] for a list of included Transformers. + |The transformations are applied according to the lists ordering.""".stripMargin), + SchemaItem("breakDataFrameLineage", ItemType.BOOLEAN, + """Stop propagating input DataFrame through action and instead get a new DataFrame from DataObject. + |This can help to save memory and performance if the input DataFrame includes many transformations from previous Actions. + |The new DataFrame will be initialized according to the SubFeed\'s partitionValues.""".stripMargin), + SchemaItem("persist", ItemType.BOOLEAN, + """Force persisting input DataFrame\'s on Disk. + |This improves performance if dataFrame is used multiple times in the transformation and can serve as a recovery point + |in case a task get\'s lost. + |Note that DataFrames are persisted automatically by the previous Action if later Actions need the same data. To avoid this + |behaviour set breakDataFrameLineage=false.""".stripMargin), + SchemaItem("executionMode", ItemType.STRING, """optional execution mode for this Action"""), + SchemaItem("executionCondition", ItemType.OBJECT, + """Definition of a Spark SQL condition with description. + |This is used for example to define failConditions of[[PartitionDiffMode]] .""".stripMargin), + SchemaItem("metricsFailCondition", ItemType.STRING, + """optional spark sql expression evaluated as where-clause against dataframe of metrics. Available columns are dataObjectId, key, value. 
+ |If there are any rows passing the where clause, a MetricCheckFailed exception is thrown.""".stripMargin), + SchemaItem("saveModeOptions", ItemType.STRING, """override and parametrize saveMode set in output DataObject configurations when writing to DataObjects."""), + SchemaItem("metadata", ItemType.STRING, """"""), + SchemaItem("agentId", ItemType.STRING, """""") + ) + + actual shouldBe expected + + } + + it should "do something" in { + val actual = schemaReader.retrieveActionProperties("CustomDataFrameAction") + println(actual) + } + + +} diff --git a/src/test/scala/io/smartdatalake/context/SDLBContextSpec.scala b/src/test/scala/io/smartdatalake/context/SDLBContextSpec.scala index 27cf7ae..582d994 100644 --- a/src/test/scala/io/smartdatalake/context/SDLBContextSpec.scala +++ b/src/test/scala/io/smartdatalake/context/SDLBContextSpec.scala @@ -10,7 +10,7 @@ import scala.util.Using class SDLBContextSpec extends UnitSpec { - private val text: String = loadFile("fixture/with-multi-lines-example.conf") + private val text: String = loadFile("fixture/hocon/with-multi-lines-example.conf") "Smart DataLake Builder Context" should "creates a context with empty config if text is empty" in { @@ -22,18 +22,18 @@ class SDLBContextSpec extends UnitSpec { } it should "uses the empty context if line is invalid" in { - val text = loadFile("fixture/basic-example.conf") + val text = loadFile("fixture/hocon/basic-example.conf") SDLBContext.createContext(text, 0, 1) shouldBe SDLBContext.EMPTY_CONTEXT SDLBContext.createContext(text, 23, 1) shouldBe SDLBContext.EMPTY_CONTEXT } it should "uses the empty context if col is invalid" in { - val text = loadFile("fixture/basic-example.conf") + val text = loadFile("fixture/hocon/basic-example.conf") SDLBContext.createContext(text, 1, -1) shouldBe SDLBContext.EMPTY_CONTEXT } it should "creates a context correctly with a basic example" in { - val text = loadFile("fixture/basic-example.conf") + val text = loadFile("fixture/hocon/basic-example.conf") val line1Start = SDLBContext.createContext(text, 1, 0) line1Start.parentPath shouldBe "" line1Start.parentWord shouldBe "" diff --git a/src/test/scala/io/smartdatalake/context/hocon/HoconParserSpec.scala b/src/test/scala/io/smartdatalake/context/hocon/HoconParserSpec.scala index e9f5d9f..3cc36fb 100644 --- a/src/test/scala/io/smartdatalake/context/hocon/HoconParserSpec.scala +++ b/src/test/scala/io/smartdatalake/context/hocon/HoconParserSpec.scala @@ -15,7 +15,7 @@ class HoconParserSpec extends UnitSpec { case class Fixture(originalText: String, text: String, config: Config) "Hocon parser" should "find path in hocon file" in { - val fixture = loadFixture("fixture/basic-example.conf") + val fixture = loadFixture("fixture/hocon/basic-example.conf") val leftCaretData = List( CaretData(1, leftCol, 0, "", ""), @@ -40,7 +40,7 @@ class HoconParserSpec extends UnitSpec { } it should "find path in file with comments" in { - val fixture = loadFixture("fixture/with-comments-example.conf") + val fixture = loadFixture("fixture/hocon/with-comments-example.conf") val leftCaretData = List( CaretData(1, leftCol, 0, "", ""), @@ -74,7 +74,7 @@ class HoconParserSpec extends UnitSpec { } it should "find path in with multi-line values" in { - val fixture = loadFixture("fixture/with-multi-lines-example.conf") + val fixture = loadFixture("fixture/hocon/with-multi-lines-example.conf") val positionMap = MultiLineTransformer.computeCorrectedPositions(fixture.originalText) val leftCaretData = List( @@ -145,7 +145,7 @@ class HoconParserSpec extends 
UnitSpec { } it should "find path in file with lists" in { //TODO not correct yet - val fixture = loadFixture("fixture/with-lists-example.conf") + val fixture = loadFixture("fixture/hocon/with-lists-example.conf") val leftCaretData = List( diff --git a/src/test/scala/io/smartdatalake/context/languageserver/SmartDataLakeLanguageServerSpec.scala b/src/test/scala/io/smartdatalake/languageserver/SmartDataLakeLanguageServerSpec.scala similarity index 93% rename from src/test/scala/io/smartdatalake/context/languageserver/SmartDataLakeLanguageServerSpec.scala rename to src/test/scala/io/smartdatalake/languageserver/SmartDataLakeLanguageServerSpec.scala index 970f735..2d48e92 100644 --- a/src/test/scala/io/smartdatalake/context/languageserver/SmartDataLakeLanguageServerSpec.scala +++ b/src/test/scala/io/smartdatalake/languageserver/SmartDataLakeLanguageServerSpec.scala @@ -1,4 +1,4 @@ -package io.smartdatalake.context.languageserver +package io.smartdatalake.languageserver import io.smartdatalake.UnitSpec import io.smartdatalake.languageserver.SmartDataLakeLanguageServer diff --git a/src/test/scala/io/smartdatalake/context/languageserver/SmartDataLakeTextDocumentServiceSpec.scala b/src/test/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentServiceSpec.scala similarity index 87% rename from src/test/scala/io/smartdatalake/context/languageserver/SmartDataLakeTextDocumentServiceSpec.scala rename to src/test/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentServiceSpec.scala index 625f083..bb75015 100644 --- a/src/test/scala/io/smartdatalake/context/languageserver/SmartDataLakeTextDocumentServiceSpec.scala +++ b/src/test/scala/io/smartdatalake/languageserver/SmartDataLakeTextDocumentServiceSpec.scala @@ -1,4 +1,4 @@ -package io.smartdatalake.context.languageserver +package io.smartdatalake.languageserver import io.smartdatalake.UnitSpec import io.smartdatalake.languageserver.SmartDataLakeTextDocumentService @@ -10,7 +10,7 @@ class SmartDataLakeTextDocumentServiceSpec extends UnitSpec { def params: CompletionParams = val p = new CompletionParams() - p.setPosition(new Position(0, 0)) + p.setPosition(new Position(16, 0)) p "SDL text document service" should "suggest at least one autocompletion item" in { diff --git a/src/test/scala/io/smartdatalake/context/utils/MultiLineTransformerSpec.scala b/src/test/scala/io/smartdatalake/utils/MultiLineTransformerSpec.scala similarity index 86% rename from src/test/scala/io/smartdatalake/context/utils/MultiLineTransformerSpec.scala rename to src/test/scala/io/smartdatalake/utils/MultiLineTransformerSpec.scala index 9fc86f5..586d880 100644 --- a/src/test/scala/io/smartdatalake/context/utils/MultiLineTransformerSpec.scala +++ b/src/test/scala/io/smartdatalake/utils/MultiLineTransformerSpec.scala @@ -1,4 +1,4 @@ -package io.smartdatalake.context.utils +package io.smartdatalake.utils import com.typesafe.config.{Config, ConfigRenderOptions, ConfigUtil} import io.smartdatalake.UnitSpec @@ -9,11 +9,11 @@ import scala.util.Using class MultiLineTransformerSpec extends UnitSpec { - private val text: String = loadFile("fixture/with-multi-lines-example.conf") + private val text: String = loadFile("fixture/hocon/with-multi-lines-example.conf") //TODO add a fixture with mix-in flattened triple quotes and not "Multi line transformer" should "correctly flatten multi lines" in { - val expectedFlattenedText = loadFile("fixture/with-multi-lines-flattened-example.conf") + val expectedFlattenedText = 
loadFile("fixture/hocon/with-multi-lines-flattened-example.conf") trimLines(MLT.flattenMultiLines(text)) should be (trimLines(expectedFlattenedText)) }