diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsExcel.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsExcel.feature new file mode 100644 index 000000000..3f5a1fe2b --- /dev/null +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsExcel.feature @@ -0,0 +1,41 @@ +# Copyright © 2023 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +@Wrangler +Feature: Wrangler - Run time scenarios + + @GCS_SOURCE_TEST @BQ_SINK_TEST + Scenario: To verify User is able to run a pipeline using parse excel directive + Given Open Datafusion Project to configure pipeline + Then Click on the Plus Green Button to import the pipelines + Then Select the file for importing the pipeline for the plugin "Directive_parse_excel" + Then Navigate to the properties page of plugin: "File" + Then Replace input plugin property: "path" with value: "gcsSourceBucket" + Then Click on the Get Schema button + Then Click on the Validate button + Then Close the Plugin Properties page + Then Navigate to the properties page of plugin: "BigQuery" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "table" with value: "bqTargetTable" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Click on the Validate button + Then Close the Plugin Properties page + Then Rename the pipeline + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in 
running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Close the pipeline logs + Then Validate The Data From BQ To BQ With Actual And Expected File for: "ExpectedDirective_parse_excel" diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsJson.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsJson.feature new file mode 100644 index 000000000..c1833f2b7 --- /dev/null +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsJson.feature @@ -0,0 +1,43 @@ +# Copyright © 2023 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +@Wrangler +Feature: Wrangler - Run time scenarios + + @BQ_SOURCE_JSON_TEST @BQ_SINK_TEST + Scenario: To verify User is able to run a pipeline using parse json directive + Given Open Datafusion Project to configure pipeline + Then Click on the Plus Green Button to import the pipelines + Then Select the file for importing the pipeline for the plugin "Directive_parse_json" + Then Navigate to the properties page of plugin: "BigQueryTable" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Replace input plugin property: "table" with value: "bqSourceTable" + Then Click on the Get Schema button + Then Click on the Validate button + Then Close the Plugin Properties page + Then Navigate to the properties page of plugin: "BigQuery2" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "table" with value: "bqTargetTable" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Click on the Validate button + Then Close the Plugin Properties page + Then Rename the pipeline + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Close the pipeline logs + Then Validate The Data From BQ To BQ With Actual And Expected File for: "ExpectedDirective_parse_json" diff --git a/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsXmlToJson.feature b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsXmlToJson.feature new file mode 100644 index 000000000..665080ef4 --- /dev/null +++ b/wrangler-transform/src/e2e-test/features/Wrangler/ParseAsXmlToJson.feature @@ -0,0 +1,43 @@ +# Copyright © 2023 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +@Wrangler +Feature: Wrangler - Run time scenarios + + @BQ_SOURCE_XML_TEST @BQ_SINK_TEST + Scenario: To verify User is able to run a pipeline using parse XmlToJson directive + Given Open Datafusion Project to configure pipeline + Then Click on the Plus Green Button to import the pipelines + Then Select the file for importing the pipeline for the plugin "Directive_parse_xml" + Then Navigate to the properties page of plugin: "BigQueryTable" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Replace input plugin property: "table" with value: "bqSourceTable" + Then Click on the Get Schema button + Then Click on the Validate button + Then Close the Plugin Properties page + Then Navigate to the properties page of plugin: "BigQuery2" + Then Replace input plugin property: "project" with value: "projectId" + Then Replace input plugin property: "table" with value: "bqTargetTable" + Then Replace input plugin property: "dataset" with value: "dataset" + Then Click on the Validate button + Then Close the Plugin Properties page + Then Rename the pipeline + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Close the pipeline logs + Then Validate The Data From BQ To BQ With Actual And Expected File for: "ExpectedDirective_parse_xml" diff --git a/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java 
b/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java index 40c60c665..3f6eb4292 100644 --- a/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java +++ b/wrangler-transform/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java @@ -24,7 +24,6 @@ import io.cdap.e2e.utils.StorageClient; import io.cucumber.java.After; import io.cucumber.java.Before; -import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.StringUtils; import org.junit.Assert; import stepsdesign.BeforeActions; @@ -34,16 +33,15 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; -import java.sql.SQLException; import java.util.NoSuchElementException; import java.util.UUID; -import static io.cdap.e2e.pages.locators.CdfGCSLocators.filePath; - /** * Setup BQ for Wrangler tests. */ public class TestSetupHooks { + public static String gcsSourceBucketName = StringUtils.EMPTY; + @Before(order = 1, value = "@BQ_SINK_TEST") public static void setTempTargetBQTableName() { @@ -71,11 +69,56 @@ public static void deleteTempTargetBQTable() throws IOException, InterruptedExce /** * Create BigQuery table. 
*/ - @Before(order = 1, value = "@BQ_SOURCE_CSV_TEST") - public static void createTempSourceBQTable() throws IOException, InterruptedException { - createSourceBQTableWithQueries(PluginPropertyUtils.pluginProp("CreateBQTableQueryFileCsv"), - PluginPropertyUtils.pluginProp("InsertBQDataQueryFileCsv")); + @Before(order = 1, value = "@BQ_SOURCE_JSON_TEST") + public static void createTempSourceBQTableJson() throws IOException, InterruptedException { + createSourceBQTableWithQueries(PluginPropertyUtils.pluginProp("CreateBQTableQueryFileJson"), + PluginPropertyUtils.pluginProp("InsertBQDataQueryFileJson")); + } + @Before(order = 1, value = "@BQ_SOURCE_XML_TEST") + public static void createTempSourceBQTableXml() throws IOException, InterruptedException { + createSourceBQTableWithQueries(PluginPropertyUtils.pluginProp("CreateBQDataQueryFileXml"), + PluginPropertyUtils.pluginProp("InsertBQDataQueryFileXml")); + } + @After(order = 1, value = "@BQ_SOURCE_TEST or @BQ_SOURCE_JSON_TEST or @BQ_SOURCE_XML_TEST") + public static void deleteTempSourceBQTable() throws IOException, InterruptedException { + String bqSourceTable = PluginPropertyUtils.pluginProp("bqSourceTable"); + BigQueryClient.dropBqQuery(bqSourceTable); + BeforeActions.scenario.write("BQ source Table " + bqSourceTable + " deleted successfully"); + PluginPropertyUtils.removePluginProp("bqSourceTable"); } + @Before(order = 1, value = "@GCS_SOURCE_TEST") + public static void createBucketWithEXCELFile() throws IOException, URISyntaxException { + gcsSourceBucketName = createGCSBucketWithFile(PluginPropertyUtils.pluginProp("testFile")); + PluginPropertyUtils.addPluginProp("gcsSourceBucket", "gs://" + gcsSourceBucketName + "/" + + PluginPropertyUtils.pluginProp("testFile")); + BeforeActions.scenario.write("GCS source bucket1 name - " + gcsSourceBucketName); + } + private static String createGCSBucketWithFile(String filePath) throws IOException, URISyntaxException { + String bucketName = StorageClient.createBucket("e2e-test-" + UUID.randomUUID()).getName(); +
StorageClient.uploadObject(bucketName, filePath, filePath); + return bucketName; + } + @After(order = 1, value = "@GCS_SOURCE_TEST") + public static void deleteSourceBucketWithFile() { + deleteGCSBucket(gcsSourceBucketName); + gcsSourceBucketName = StringUtils.EMPTY; + } + private static void deleteGCSBucket(String bucketName) { + try { + for (Blob blob : StorageClient.listObjects(bucketName).iterateAll()) { + StorageClient.deleteObject(bucketName, blob.getName()); + } + StorageClient.deleteBucket(bucketName); + BeforeActions.scenario.write("Deleted GCS Bucket " + bucketName); + } catch (StorageException | IOException e) { + if (e.getMessage().contains("The specified bucket does not exist")) { + BeforeActions.scenario.write("GCS Bucket " + bucketName + " does not exist."); + } else { + Assert.fail(e.getMessage()); + } + } + } + private static void createSourceBQTableWithQueries(String bqCreateTableQueryFile, String bqInsertDataQueryFile) throws IOException, InterruptedException { diff --git a/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_excel b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_excel new file mode 100644 index 000000000..82eb3967f --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_excel @@ -0,0 +1,2 @@ +{"copiedname":"very","id":0,"name":"very","phone":"8838.0","uniquenum":"very,0"} +{"copiedname":"hello","id":2,"name":"hell","phone":"12345.0","uniquenum":"hello,2"} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_json b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_json new file mode 100644 index 000000000..b8cac65cb --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_json @@ -0,0 +1,3 @@ +{"Body":"hello abc","copied":{"first":"Root","last":"joy"},"desc":"nick, 
hello abc","id":22,"json_age":"{\"json_id\":22,\"copied\":{\"first\":\"Root\",\"last\":\"joy\"},\"json_age\":1,\"json_name\":{\"first\":\"Root\",\"last\":\"joy\"},\"json_pet\":\"testing\",\"json_id_json_name\":\"22,{\\\"first\\\":\\\"Root\\\",\\\"last\\\":\\\"joy\\\"}\",\"body\":\"hello abc\",\"desc\":\"nick, hello abc\"}","json_id_json_name":"22,{\"first\":\"Root\",\"last\":\"joy\"}","json_name":{"first":"Root","last":"joy"},"json_pet":"testing"} +{"Body":"hello def","copied":{"first":"dded","last":"share"},"desc":"hello, hello def","id":23,"json_age":"{\"json_id\":23,\"copied\":{\"first\":\"dded\",\"last\":\"share\"},\"json_age\":2,\"json_name\":{\"first\":\"dded\",\"last\":\"share\"},\"json_pet\":\"testing\",\"json_id_json_name\":\"23,{\\\"first\\\":\\\"dded\\\",\\\"last\\\":\\\"share\\\"}\",\"body\":\"hello def\",\"desc\":\"hello, hello def\"}","json_id_json_name":"23,{\"first\":\"dded\",\"last\":\"share\"}","json_name":{"first":"dded","last":"share"},"json_pet":"testing"} +{"Body":"hello ghi","copied":{"first":"Root","last":"Joltie"},"desc":"doms, hello ghi","id":24,"json_age":"{\"json_id\":24,\"copied\":{\"first\":\"Root\",\"last\":\"Joltie\"},\"json_age\":3,\"json_name\":{\"first\":\"Root\",\"last\":\"Joltie\"},\"json_pet\":\"testing\",\"json_id_json_name\":\"24,{\\\"first\\\":\\\"Root\\\",\\\"last\\\":\\\"Joltie\\\"}\",\"body\":\"hello ghi\",\"desc\":\"doms, hello ghi\"}","json_id_json_name":"24,{\"first\":\"Root\",\"last\":\"Joltie\"}","json_name":{"first":"Root","last":"Joltie"},"json_pet":"testing"} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_xmltojson b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_xmltojson new file mode 100644 index 000000000..2f9d3ba92 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQValidationExpectedFiles/Directive_parse_xmltojson @@ -0,0 +1,6 @@ 
+{"Email":"abc01@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"abc01"},"email_porter":["abc","mail","com"],"id":"1","xmldata_note":{"body":"Dont forget me this week!","from":"Tani","heading":"Reminder","to":"Tove"}} +{"Email":"def02@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"def02"},"email_porter":["def","mail","com"],"id":"2","xmldata_note":{"body":"Dont forget us this holiday!","from":"joy","heading":"Reminder","to":"Tove"}} +{"Email":"ghi03@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"ghi03"},"email_porter":["ghi","mail","com"],"id":"3","xmldata_note":{"body":"Dont forget him this weekend!","from":"shree","heading":"Reminder","to":"Tove"}} +{"Email":"abc01@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"abc01"},"email_porter":["abc","mail","com"],"id":"abc","xmldata_note":{"body":"Dont forget me this week!","from":"Tani","heading":"Reminder","to":"Tove"}} +{"Email":"def02@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"def02"},"email_porter":["def","mail","com"],"id":"def","xmldata_note":{"body":"Dont forget us this holiday!","from":"joy","heading":"Reminder","to":"Tove"}} +{"Email":"ghi03@mail.com","distance":2.0,"distance2":0.3571428656578064,"email_domain":{"distance":2.0,"email_account":"ghi03"},"email_porter":["ghi","mail","com"],"id":"ghi","xmldata_note":{"body":"Dont forget him this weekend!","from":"shree","heading":"Reminder","to":"Tove"}} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt new file mode 100644 index 000000000..a711921e2 --- /dev/null +++ 
b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt @@ -0,0 +1 @@ +create table `DATASET.TABLE_NAME` (email STRING, xmldata STRING) diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt new file mode 100644 index 000000000..0dc9608ce --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt @@ -0,0 +1,5 @@ +INSERT INTO DATASET.TABLE_NAME (email,xmldata) +VALUES +('abc01@mail.com','<note><to>Tove</to><from>Tani</from><heading>Reminder</heading><body>Dont forget me this week!</body></note>'), +('def02@mail.com','<note><to>Tove</to><from>joy</from><heading>Reminder</heading><body>Dont forget us this holiday!</body></note>'), +('ghi03@mail.com','<note><to>Tove</to><from>shree</from><heading>Reminder</heading><body>Dont forget him this weekend!</body></note>'); diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt new file mode 100644 index 000000000..dc9fa7d17 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt @@ -0,0 +1,6 @@ +INSERT INTO DATASET.TABLE_NAME (body,json) +VALUES +(' hello abc', '{"id": 1, "name": {"first": "Root", "last": "joy"}, "age": 22, "pet": "nick", "height": 5.8}'), +('hello def', '{"id": 2, "name": {"first": "dded", "last": "share"}, "age": 23, "pet": "hello", "height": 6.8}'), +('hello ghi', '{"id": 3, "name": {"first": "Root", "last": "Joltie"}, "age": 24, "pet": "doms", "height": 7.8}'); + diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt new file mode 100644 index 000000000..be6b585ea --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt @@ -0,0 +1 @@ +create
table `DATASET.TABLE_NAME` (body STRING, json STRING) \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/test1.xlsx b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/test1.xlsx new file mode 100644 index 000000000..adaa5291b Binary files /dev/null and b/wrangler-transform/src/e2e-test/resources/BQtesdata/BigQuery/test1.xlsx differ diff --git a/wrangler-transform/src/e2e-test/resources/pluginParameters.properties b/wrangler-transform/src/e2e-test/resources/pluginParameters.properties index 8ea4bf26e..172b454d0 100644 --- a/wrangler-transform/src/e2e-test/resources/pluginParameters.properties +++ b/wrangler-transform/src/e2e-test/resources/pluginParameters.properties @@ -1,16 +1,22 @@ #json file path -Directive_parse_csv=testData/Wrangler\ - /parse_csv_wrangle-cdap-data-pipeline.json +Directive_parse_json=testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json +Directive_parse_xml=testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json +Directive_parse_excel=testData/Wrangler/parse_excel_Wrangle-cdap-data-pipeline (1).json bqSourceTable=dummy sourcePath=example/hello.csv gcsSourceBucket=dummy +testFile=BQtesdata/BigQuery/test1.xlsx #bq queries file path -CreateBQTableQueryFileCsv=BQtesdata/BigQuery/BigQueryCreateTableQueryCsv.txt -InsertBQDataQueryFileCsv=BQtesdata/BigQuery/BigQueryInsertDataQueryCsv.txt +CreateBQTableQueryFileJson=BQtesdata/BigQuery/BigQuerycreateTableQueryjson.txt +InsertBQDataQueryFileJson=BQtesdata/BigQuery/BigQueryInsertDataQueryparsejson.txt +CreateBQDataQueryFileXml=BQtesdata/BigQuery/BigQueryCreateTableQueryXml.txt +InsertBQDataQueryFileXml=BQtesdata/BigQuery/BigQueryInsertDataQueryXml.txt #bq properties projectId=cdf-athena dataset=test_automation dataset2=Wrangler #expectedBQFiles -ExpectedDirective_parse_csv=BQValidationExpectedFiles/Directive_parse_csv +ExpectedDirective_parse_json=BQValidationExpectedFiles/Directive_parse_json 
+ExpectedDirective_parse_xml=BQValidationExpectedFiles/Directive_parse_xmltojson +ExpectedDirective_parse_excel=BQValidationExpectedFiles/Directive_parse_excel diff --git a/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_excel_Wrangle-cdap-data-pipeline (1).json b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_excel_Wrangle-cdap-data-pipeline (1).json new file mode 100644 index 000000000..109a591d8 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_excel_Wrangle-cdap-data-pipeline (1).json @@ -0,0 +1,425 @@ +{ + "name": "parse_excel_Wrangle", + "description": "Data Pipeline Application", + "artifact": { + "name": "cdap-data-pipeline", + "version": "6.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "config": { + "resources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "driverResources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "connections": [ + { + "from": "File", + "to": "Wrangler" + }, + { + "from": "Wrangler", + "to": "BigQuery" + } + ], + "comments": [], + "postActions": [], + "properties": {}, + "processTimingEnabled": true, + "stageLoggingEnabled": false, + "stages": [ + { + "name": "File", + "plugin": { + "name": "File", + "type": "batchsource", + "label": "File", + "artifact": { + "name": "core-plugins", + "version": "2.12.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "referenceName": "sfdsf", + "path": "gs://00000000-e2e-0014a44f-81be-4501-8360-0ddca1c39789/test1.xlsx", + "format": "blob", + "sampleSize": "1000", + "filenameOnly": "false", + "recursive": "false", + "ignoreNonExistingFolders": "false", + "fileEncoding": "UTF-8", + "schema": "{\"type\":\"record\",\"name\":\"blob\",\"fields\":[{\"name\":\"body\",\"type\":\"bytes\"}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"blob\",\"fields\":[{\"name\":\"body\",\"type\":\"bytes\"}]}" + } + ], + "id": "File", + "type": "batchsource", + "label": "File", + 
"icon": "icon-file", + "$$hashKey": "object:417", + "isPluginAvailable": true, + "_uiPosition": { + "left": "496px", + "top": "343px" + }, + "_backendProperties": { + "schema": { + "name": "schema", + "description": "Output schema for the source. Formats like 'avro' and 'parquet' require a schema in order to read the data.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "copyHeader": { + "name": "copyHeader", + "description": "", + "type": "boolean", + "required": false, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + }, + "fileEncoding": { + "name": "fileEncoding", + "description": "File encoding for the source files. The default encoding is 'UTF-8'", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "fileRegex": { + "name": "fileRegex", + "description": "Regular expression that file paths must match in order to be included in the input. The full file path is compared, not just the file name.If no value is given, no file filtering will be done. See https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html for more information about the regular expression syntax.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "format": { + "name": "format", + "description": "Format of the data to read. Supported formats are 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', or 'tsv'. ", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "ignoreNonExistingFolders": { + "name": "ignoreNonExistingFolders", + "description": "Whether to allow an input that does not exist. When false, the source will fail the run if the input does not exist. When true, the run will not fail and the source will not generate any output. 
The default value is false.", + "type": "boolean", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "skipHeader": { + "name": "skipHeader", + "description": "Whether to use first row as header. Supported formats are 'text', 'csv', 'tsv', 'delimited'. Default value is false.", + "type": "boolean", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "sampleSize": { + "name": "sampleSize", + "description": "The maximum number of rows that will get investigated for automatic data type detection.", + "type": "long", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "pathField": { + "name": "pathField", + "description": "Output field to place the path of the file that the record was read from. If not specified, the file path will not be included in output records. If specified, the field must exist in the output schema as a string.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "recursive": { + "name": "recursive", + "description": "Whether to recursively read directories within the input directory. The default is false.", + "type": "boolean", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "filenameOnly": { + "name": "filenameOnly", + "description": "Whether to only use the filename instead of the URI of the file path when a path field is given. The default value is false.", + "type": "boolean", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "path": { + "name": "path", + "description": "Path to file(s) to be read. If a directory is specified, terminate the path name with a '/'. For distributed file system such as HDFS, file system name should comefrom 'fs.DefaultFS' property in the 'core-site.xml'. 
For example, 'hdfs://mycluster.net:8020/input', where value of the property 'fs.DefaultFS' in the 'core-site.xml' is 'hdfs://mycluster.net:8020'.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "maxSplitSize": { + "name": "maxSplitSize", + "description": "Maximum size of each partition used to read data. Smaller partitions will increase the level of parallelism, but will require more resources and overhead.", + "type": "long", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "delimiter": { + "name": "delimiter", + "description": "The delimiter to use if the format is 'delimited'. The delimiter will be ignored if the format is anything other than 'delimited'.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "enableQuotedValues": { + "name": "enableQuotedValues", + "description": "Whether to treat content between quotes as a value. This value will only be used if the format is 'csv', 'tsv' or 'delimited'. The default value is false.", + "type": "boolean", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "override": { + "name": "override", + "description": "A list of columns with the corresponding data types for whom the automatic data type detection gets skipped.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "fileSystemProperties": { + "name": "fileSystemProperties", + "description": "Any additional properties to use when reading from the filesystem. 
This is an advanced feature that requires knowledge of the properties supported by the underlying filesystem.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "referenceName": { + "name": "referenceName", + "description": "Name be used to uniquely identify this source for lineage, annotating metadata, etc.", + "type": "string", + "required": true, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + } + }, + "description": "Batch source for File Systems", + "selected": false + }, + { + "name": "Wrangler", + "plugin": { + "name": "Wrangler", + "type": "transform", + "label": "Wrangler", + "artifact": { + "name": "wrangler-transform", + "version": "4.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "field": "*", + "precondition": "false", + "directives": "parse-as-excel :body '0' true\ncopy name copiedname\nmerge name bkd uniquenum ','\nrename bkd rollno\ndrop fwd\nswap id rollno\nsplit-to-rows :name 'o'\nfilter-rows-on condition-false rollno !~ '2.0'", + "on-error": "fail-pipeline", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}", + "workspaceId": "0cf0176a-5f84-41ef-9411-2b4f9c7dcfc8" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + 
} + ], + "inputSchema": [ + { + "name": "File", + "schema": "{\"type\":\"record\",\"name\":\"blob\",\"fields\":[{\"name\":\"body\",\"type\":\"bytes\"}]}" + } + ], + "id": "Wrangler", + "type": "transform", + "label": "Wrangler", + "icon": "icon-DataPreparation", + "$$hashKey": "object:418", + "isPluginAvailable": true, + "_uiPosition": { + "left": "796px", + "top": "343px" + }, + "selected": false, + "_backendProperties": { + "schema": { + "name": "schema", + "description": "Specifies the schema that has to be output.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "preconditionSQL": { + "name": "preconditionSQL", + "description": "SQL Precondition expression specifying filtering before applying directives (false to filter)", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "udd": { + "name": "udd", + "description": "List of User Defined Directives (UDD) that have to be loaded.", + "type": "string", + "required": false, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + }, + "field": { + "name": "field", + "description": "Name of the input field to be wrangled or '*' to wrangle all the fields.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "on-error": { + "name": "on-error", + "description": "How to handle error in record processing", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "directives": { + "name": "directives", + "description": "Recipe for wrangling the input records", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "expressionLanguage": { + "name": "expressionLanguage", + "description": "Toggle to configure precondition language between JEXL and SQL", 
+ "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "precondition": { + "name": "precondition", + "description": "JEXL Precondition expression specifying filtering before applying directives (true to filter)", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + } + }, + "description": "Wrangler - A interactive tool for data cleansing and transformation." + }, + { + "name": "BigQuery", + "plugin": { + "name": "BigQueryTable", + "type": "batchsink", + "label": "BigQuery", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "dataset": "Wrangler", + "table": "excelupds", + "operation": "insert", + "truncateTable": "false", + "allowSchemaRelaxation": "false", + "location": "US", + "createPartitionedTable": "false", + "partitioningType": "TIME", + "partitionFilterRequired": "false", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "Wrangler", + "schema": 
"{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"id\",\"type\":[\"int\",\"null\"]},{\"name\":\"rollno\",\"type\":[\"string\",\"null\"]},{\"name\":\"name\",\"type\":[\"string\",\"null\"]},{\"name\":\"phone\",\"type\":[\"string\",\"null\"]},{\"name\":\"copiedname\",\"type\":[\"string\",\"null\"]},{\"name\":\"uniquenum\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "id": "BigQuery", + "type": "batchsink", + "label": "BigQuery", + "icon": "fa-plug", + "$$hashKey": "object:419", + "isPluginAvailable": true, + "_uiPosition": { + "left": "1096px", + "top": "343px" + }, + "selected": false + } + ], + "schedule": "0 1 */1 * *", + "engine": "spark", + "numOfRecordsPreview": 100, + "rangeRecordsPreview": { + "min": 1, + "max": "5000" + }, + "description": "Data Pipeline Application", + "maxConcurrentRuns": 1, + "pushdownEnabled": false, + "transformationPushdown": {} + }, + "version": "5b46b464-4f37-11ee-9dbc-000000d45dd0" +} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json new file mode 100644 index 000000000..ccc37ccef --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_json_Wrangle-cdap-data-pipeline (1).json @@ -0,0 +1,180 @@ +{ + "name": "parse_json_Wrangle", + "description": "Data Pipeline Application", + "artifact": { + "name": "cdap-data-pipeline", + "version": "6.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "config": { + "resources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "driverResources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "connections": [ + { + "from": "BigQueryTable", + "to": "Wrangler" + }, + { + "from": "Wrangler", + "to": "BigQuery2" + } + ], + "postActions": [], + "properties": {}, + "processTimingEnabled": true, + "stageLoggingEnabled": true, + "stages": [ + { + "name": 
"BigQueryTable", + "plugin": { + "name": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "dataset": "Wrangler", + "table": "jstab", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Body\",\"type\":\"string\"},{\"name\":\"json\",\"type\":\"string\"}]}", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "enableQueryingViews": "false" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Body\",\"type\":\"string\"},{\"name\":\"json\",\"type\":\"string\"}]}" + } + ], + "id": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "icon": "fa-plug", + "$$hashKey": "object:518", + "isPluginAvailable": true, + "_uiPosition": { + "left": "496px", + "top": "327px" + } + }, + { + "name": "Wrangler", + "plugin": { + "name": "Wrangler", + "type": "transform", + "label": "Wrangler", + "artifact": { + "name": "wrangler-transform", + "version": "4.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "directives": "parse-as-json :json 1\nltrim :Body\nset-column :desc concat(json_pet, \", \", body)\ncopy :json_name :copied\nswap :json_id :json_age\nmerge :json_id :json_name :json_id_json_name ,\nmask-number :json_pet 'testing'\ndrop json_height\nwrite-as-json-map :json_age\nrename json_id id", + "field": "*", + "precondition": "false", + "workspaceId": "d496b8e4-ca6f-4d38-9877-d76bb52c218e", + "schema": 
"{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}", + "on-error": "fail-pipeline" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "BigQueryTable", + "schema": 
"{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Body\",\"type\":\"string\"},{\"name\":\"json\",\"type\":\"string\"}]}" + } + ], + "id": "Wrangler", + "type": "transform", + "label": "Wrangler", + "icon": "icon-DataPreparation", + "$$hashKey": "object:519", + "isPluginAvailable": true, + "_uiPosition": { + "left": "796px", + "top": "327px" + } + }, + { + "name": "BigQuery2", + "plugin": { + "name": "BigQueryTable", + "type": "batchsink", + "label": "BigQuery2", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "dataset": "Wrangler", + "table": "jstabsupd", + "operation": "insert", + "truncateTable": "false", + "allowSchemaRelaxation": "false", + "location": "US", + "createPartitionedTable": "false", + "partitioningType": "TIME", + "partitionFilterRequired": "false", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": 
"{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "Wrangler", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Body\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_age\",\"type\":[\"string\",\"null\"]},{\"name\":\"json_name\",\"type\":[{\"type\":\"record\",\"name\":\"json_name05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"long\",\"null\"]},{\"name\":\"json_pet\",\"type\":[\"string\",\"null\"]},{\"name\":\"desc\",\"type\":[\"string\",\"null\"]},{\"name\":\"copied\",\"type\":[{\"type\":\"record\",\"name\":\"copied05F0DF247CD8481657781C26E1595028\",\"fields\":[{\"name\":\"first\",\"type\":[\"string\",\"null\"]},{\"name\":\"last\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"json_id_json_name\",\"type\":[\"string\",\"null\"]}]}" + } + ], + "id": "BigQuery2", + "type": "batchsink", + "label": "BigQuery2", + "icon": "fa-plug", + "$$hashKey": "object:520", + "isPluginAvailable": true, + "_uiPosition": { + "left": "1096px", + "top": "327px" + } + } + 
], + "schedule": "0 1 */1 * *", + "engine": "spark", + "numOfRecordsPreview": 100, + "rangeRecordsPreview": { + "min": 1, + "max": "5000" + }, + "description": "Data Pipeline Application", + "maxConcurrentRuns": 1, + "pushdownEnabled": false, + "transformationPushdown": {} + }, + "version": "516e13c0-4c05-11ee-b70a-0000001ecd5c" +} \ No newline at end of file diff --git a/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json new file mode 100644 index 000000000..044e982e8 --- /dev/null +++ b/wrangler-transform/src/e2e-test/resources/testData/Wrangler/parse_xmltojson_wrangle-cdap-data-pipeline.json @@ -0,0 +1,362 @@ +{ + "name": "parse_xmltojson_wrangle", + "description": "Data Pipeline Application", + "artifact": { + "name": "cdap-data-pipeline", + "version": "6.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "config": { + "resources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "driverResources": { + "memoryMB": 2048, + "virtualCores": 1 + }, + "connections": [ + { + "from": "BigQueryTable", + "to": "Wrangler" + }, + { + "from": "Wrangler", + "to": "BigQuery2" + } + ], + "postActions": [], + "properties": {}, + "processTimingEnabled": true, + "stageLoggingEnabled": true, + "stages": [ + { + "name": "BigQueryTable", + "plugin": { + "name": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "dataset": "Wrangler", + "table": "xmltabs", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Email\",\"type\":\"string\"},{\"name\":\"xmldata\",\"type\":\"string\"}]}", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "enableQueryingViews": "false" + } + }, + "outputSchema": 
[ + { + "name": "etlSchemaBody", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Email\",\"type\":\"string\"},{\"name\":\"xmldata\",\"type\":\"string\"}]}" + } + ], + "id": "BigQueryTable", + "type": "batchsource", + "label": "BigQueryTable", + "icon": "fa-plug", + "$$hashKey": "object:615", + "isPluginAvailable": true, + "_uiPosition": { + "left": "496px", + "top": "343px" + }, + "_backendProperties": { + "schema": { + "name": "schema", + "description": "The schema of the table to read.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "viewMaterializationDataset": { + "name": "viewMaterializationDataset", + "description": "The dataset in the specified project where the temporary table should be created. Defaults to the same dataset in which the table is located.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "enableQueryingViews": { + "name": "enableQueryingViews", + "description": "Whether to allow querying views. Since BigQuery views are not materialized by default, querying them may have a performance overhead.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "serviceAccountJSON": { + "name": "serviceAccountJSON", + "description": "Content of the service account file.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "partitionTo": { + "name": "partitionTo", + "description": "It's inclusive partition end date. It should be a String with format \"yyyy-MM-dd\". 
This value is ignored if the table does not support partitioning.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "cmekKey": { + "name": "cmekKey", + "description": "The GCP customer managed encryption key (CMEK) name used to encrypt data written to any bucket, dataset or table created by the plugin. If the bucket, dataset or table already exists, this is ignored. More information can be found at https://cloud.google.com/data-fusion/docs/how-to/customer-managed-encryption-keys", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "useConnection": { + "name": "useConnection", + "description": "Whether to use an existing connection.", + "type": "boolean", + "required": false, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + }, + "project": { + "name": "project", + "description": "Google Cloud Project ID. It can be found on the Dashboard in the Google Cloud Platform Console.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "viewMaterializationProject": { + "name": "viewMaterializationProject", + "description": "The project name where the temporary table should be created. Defaults to the same project in which the table is located.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "filter": { + "name": "filter", + "description": "The WHERE clause filters out rows by evaluating each row against boolean expression, and discards all rows that do not return TRUE (that is, rows that return FALSE or NULL).", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "bucket": { + "name": "bucket", + "description": "The Google Cloud Storage bucket to store temporary data in. 
Cloud Storage data will be deleted after it is loaded into BigQuery. If it is not provided, a unique bucket will be automatically created and then deleted after the run finishes. The service account must have permission to create buckets in the configured project.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "partitionFrom": { + "name": "partitionFrom", + "description": "It's inclusive partition start date. It should be a String with format \"yyyy-MM-dd\". This value is ignored if the table does not support partitioning.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "serviceFilePath": { + "name": "serviceFilePath", + "description": "Path on the local file system of the service account key used for authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. When running on other clusters, the file must be present on every node in the cluster.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "serviceAccountType": { + "name": "serviceAccountType", + "description": "Service account type, file path where the service account is located or the JSON content of the service account.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "connection": { + "name": "connection", + "description": "The existing connection to use.", + "type": "bigqueryconnectorconfig", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [ + "serviceAccountJSON", + "serviceFilePath", + "project", + "serviceAccountType", + "datasetProject" + ] + }, + "datasetProject": { + "name": "datasetProject", + "description": "The project the dataset belongs to. 
This is only required if the dataset is not in the same project that the BigQuery job will run in. If no value is given, it will default to the configured project ID.", + "type": "string", + "required": false, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "dataset": { + "name": "dataset", + "description": "The dataset to write to. A dataset is contained within a specific project. Datasets are top-level containers that are used to organize and control access to tables and views.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "table": { + "name": "table", + "description": "The table to read from. A table contains individual records organized in rows. Each record is composed of columns (also called fields). Every table is defined by a schema that describes the column names, data types, and other information.", + "type": "string", + "required": true, + "macroSupported": true, + "macroEscapingEnabled": false, + "children": [] + }, + "referenceName": { + "name": "referenceName", + "description": "This will be used to uniquely identify this source for lineage, annotating metadata, etc.", + "type": "string", + "required": false, + "macroSupported": false, + "macroEscapingEnabled": false, + "children": [] + } + }, + "description": "This source reads the entire contents of a BigQuery table. 
BigQuery is Google's serverless, highly scalable, enterprise data warehouse.Data is first written to a temporary location on Google Cloud Storage, then read into the pipeline from there.", + "selected": false + }, + { + "name": "Wrangler", + "plugin": { + "name": "Wrangler", + "type": "transform", + "label": "Wrangler", + "artifact": { + "name": "wrangler-transform", + "version": "4.10.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "directives": "parse-xml-to-json :xmldata 1\nsplit-email :email\ntext-distance block email email_account distance\ntext-metric longest-common-subsequence email email_account distance2\nwrite-as-json-object :email_domain distance,email_account\nstemming :email\nsplit-to-rows :email_account '0'\nrename :email_account id", + "field": "*", + "precondition": "false", + "workspaceId": "f076fbe0-b7f8-47bc-8cc1-2a40e04177e1", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}", + "on-error": "fail-pipeline" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": 
"{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "BigQueryTable", + "schema": "{\"type\":\"record\",\"name\":\"output\",\"fields\":[{\"name\":\"Email\",\"type\":\"string\"},{\"name\":\"xmldata\",\"type\":\"string\"}]}" + } + ], + "id": "Wrangler", + "type": "transform", + "label": "Wrangler", + "icon": "icon-DataPreparation", + "$$hashKey": "object:616", + "isPluginAvailable": true, + "_uiPosition": { + "left": "796px", + "top": "343px" + }, + "selected": false + }, + { + "name": "BigQuery2", + "plugin": { + "name": "BigQueryTable", + "type": "batchsink", + "label": "BigQuery2", + "artifact": { + "name": "google-cloud", + "version": "0.23.0-SNAPSHOT", + "scope": "SYSTEM" + }, + "properties": { + "useConnection": "false", + "project": "auto-detect", + "serviceAccountType": "filePath", + "serviceFilePath": "auto-detect", + "dataset": "Wrangler", + "table": "xmltabupd", + "operation": "insert", + "truncateTable": "false", + "allowSchemaRelaxation": "false", + "location": "US", + "createPartitionedTable": "false", + 
"partitioningType": "TIME", + "partitionFilterRequired": "false", + "schema": "{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + }, + "outputSchema": [ + { + "name": "etlSchemaBody", + "schema": 
"{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + ], + "inputSchema": [ + { + "name": "Wrangler", + "schema": 
"{\"type\":\"record\",\"name\":\"outputSchema\",\"fields\":[{\"name\":\"Email\",\"type\":[\"string\",\"null\"]},{\"name\":\"xmldata_note\",\"type\":[{\"type\":\"record\",\"name\":\"xmldata_note69A9BFB19CE40D9BB21E66FF1DCB2823\",\"fields\":[{\"name\":\"heading\",\"type\":[\"string\",\"null\"]},{\"name\":\"from\",\"type\":[\"string\",\"null\"]},{\"name\":\"to\",\"type\":[\"string\",\"null\"]},{\"name\":\"body\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"id\",\"type\":[\"string\",\"null\"]},{\"name\":\"email_domain\",\"type\":[{\"type\":\"record\",\"name\":\"email_domain53E9571E3B0C6D8ACD29805625EDE284\",\"fields\":[{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_account\",\"type\":[\"string\",\"null\"]}]},\"null\"]},{\"name\":\"distance\",\"type\":[\"float\",\"null\"]},{\"name\":\"distance2\",\"type\":[\"float\",\"null\"]},{\"name\":\"email_porter\",\"type\":[{\"type\":\"array\",\"items\":[\"string\",\"null\"]},\"null\"]}]}" + } + ], + "id": "BigQuery2", + "type": "batchsink", + "label": "BigQuery2", + "icon": "fa-plug", + "$$hashKey": "object:617", + "isPluginAvailable": true, + "_uiPosition": { + "left": "1096px", + "top": "343px" + }, + "selected": false + } + ], + "schedule": "0 1 */1 * *", + "engine": "spark", + "numOfRecordsPreview": 100, + "rangeRecordsPreview": { + "min": 1, + "max": "5000" + }, + "maxConcurrentRuns": 1, + "pushdownEnabled": false, + "transformationPushdown": {} + }, + "version": "fe20077f-4e31-11ee-b01d-000000d34db8" +} \ No newline at end of file