From 9892ff3fc122ca3e38ef0ad51d4fcfa69dd7c235 Mon Sep 17 00:00:00 2001 From: Jialiang Tan Date: Tue, 24 Sep 2024 19:46:20 -0700 Subject: [PATCH] Deprecate transfer capacity for shared arbitrator --- velox/common/memory/SharedArbitrator.h | 2 +- .../memory/tests/MemoryArbitratorTest.cpp | 11 +- .../memory/tests/MockSharedArbitratorTest.cpp | 2 +- velox/exec/tests/utils/ArbitratorTestUtil.cpp | 2 - .../fuzzer/proto/spark/connect/base.proto | 817 ++++++++++++++ .../fuzzer/proto/spark/connect/catalog.proto | 243 ++++ .../fuzzer/proto/spark/connect/commands.proto | 416 +++++++ .../fuzzer/proto/spark/connect/common.proto | 48 + .../proto/spark/connect/example_plugins.proto | 42 + .../proto/spark/connect/expressions.proto | 382 +++++++ .../proto/spark/connect/relations.proto | 1003 +++++++++++++++++ .../fuzzer/proto/spark/connect/types.proto | 195 ++++ 12 files changed, 3154 insertions(+), 9 deletions(-) create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/base.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/catalog.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/commands.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/common.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/example_plugins.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/expressions.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/relations.proto create mode 100644 velox/functions/sparksql/fuzzer/proto/spark/connect/types.proto diff --git a/velox/common/memory/SharedArbitrator.h b/velox/common/memory/SharedArbitrator.h index 6c784f5373f29..6ca0ef513530b 100644 --- a/velox/common/memory/SharedArbitrator.h +++ b/velox/common/memory/SharedArbitrator.h @@ -66,7 +66,7 @@ class SharedArbitrator : public memory::MemoryArbitrator { static constexpr std::string_view kMemoryPoolTransferCapacity{ "memory-pool-transfer-capacity"}; static constexpr std::string_view kDefaultMemoryPoolTransferCapacity{ - "128MB"}; + "0B"}; static uint64_t getMemoryPoolTransferCapacity( const std::unordered_map& configs); diff --git a/velox/common/memory/tests/MemoryArbitratorTest.cpp b/velox/common/memory/tests/MemoryArbitratorTest.cpp index 61a956d7dde62..2e71aa4390a05 100644 --- a/velox/common/memory/tests/MemoryArbitratorTest.cpp +++ b/velox/common/memory/tests/MemoryArbitratorTest.cpp @@ -144,6 +144,7 @@ TEST_F(MemoryArbitrationTest, queryMemoryCapacity) { ASSERT_FALSE(manager.arbitrator()->growCapacity(rootPool.get(), 6 << 20)); ASSERT_EQ(rootPool->capacity(), 1 << 20); ASSERT_TRUE(manager.arbitrator()->growCapacity(rootPool.get(), 2 << 20)); + ASSERT_TRUE(manager.arbitrator()->growCapacity(rootPool.get(), 1 << 20)); ASSERT_EQ(rootPool->capacity(), 4 << 20); ASSERT_EQ(manager.arbitrator()->stats().freeCapacityBytes, 2 << 20); ASSERT_EQ(manager.arbitrator()->stats().freeReservedCapacityBytes, 2 << 20); @@ -154,19 +155,19 @@ TEST_F(MemoryArbitrationTest, queryMemoryCapacity) { "Exceeded memory pool capacity after attempt to grow capacity through " "arbitration. 
Requestor pool name 'leaf-1.0', request size 7.00MB, " "memory pool capacity 4.00MB, memory pool max capacity 8.00MB"); - ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 0), 1 << 20); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 0), 0); ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 0); ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 1), 0); ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 1), 0); - ASSERT_EQ(rootPool->capacity(), 3 << 20); + ASSERT_EQ(rootPool->capacity(), 4 << 20); static_cast(rootPool.get())->testingSetReservation(0); ASSERT_EQ( manager.arbitrator()->shrinkCapacity(leafPool.get(), 1 << 20), 1 << 20); ASSERT_EQ( manager.arbitrator()->shrinkCapacity(rootPool.get(), 1 << 20), 1 << 20); - ASSERT_EQ(rootPool->capacity(), 1 << 20); - ASSERT_EQ(leafPool->capacity(), 1 << 20); - ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 1 << 20); + ASSERT_EQ(rootPool->capacity(), 2 << 20); + ASSERT_EQ(leafPool->capacity(), 2 << 20); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 2 << 20); ASSERT_EQ(rootPool->capacity(), 0); ASSERT_EQ(leafPool->capacity(), 0); } diff --git a/velox/common/memory/tests/MockSharedArbitratorTest.cpp b/velox/common/memory/tests/MockSharedArbitratorTest.cpp index cc32ef111791c..2f31092fe8e4c 100644 --- a/velox/common/memory/tests/MockSharedArbitratorTest.cpp +++ b/velox/common/memory/tests/MockSharedArbitratorTest.cpp @@ -558,7 +558,7 @@ TEST_F(MockSharedArbitrationTest, extraConfigs) { ASSERT_EQ( SharedArbitrator::ExtraConfig::getMemoryPoolTransferCapacity( emptyConfigs), - 128 << 20); + 0); ASSERT_EQ( SharedArbitrator::ExtraConfig::getMemoryReclaimMaxWaitTimeMs( emptyConfigs), diff --git a/velox/exec/tests/utils/ArbitratorTestUtil.cpp b/velox/exec/tests/utils/ArbitratorTestUtil.cpp index da19a289eb2cc..0630c36a19ae7 100644 --- a/velox/exec/tests/utils/ArbitratorTestUtil.cpp +++ b/velox/exec/tests/utils/ArbitratorTestUtil.cpp @@ -59,8 +59,6 @@ std::unique_ptr createMemoryManager( options.extraArbitratorConfigs = { {std::string(ExtraConfig::kMemoryPoolInitialCapacity), folly::to(memoryPoolInitCapacity) + "B"}, - {std::string(ExtraConfig::kMemoryPoolTransferCapacity), - folly::to(memoryPoolTransferCapacity) + "B"}, {std::string(ExtraConfig::kMemoryReclaimMaxWaitTime), folly::to(maxReclaimWaitMs) + "ms"}, {std::string(ExtraConfig::kGlobalArbitrationEnabled), "true"}, diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/base.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/base.proto new file mode 100644 index 0000000000000..65e2493f83687 --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/base.proto @@ -0,0 +1,817 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +import "google/protobuf/any.proto"; +import "spark/connect/commands.proto"; +import "spark/connect/common.proto"; +import "spark/connect/expressions.proto"; +import "spark/connect/relations.proto"; +import "spark/connect/types.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// A [[Plan]] is the structure that carries the runtime information for the execution from the +// client to the server. A [[Plan]] can either be of the type [[Relation]] which is a reference +// to the underlying logical plan or it can be of the [[Command]] type that is used to execute +// commands on the server. +message Plan { + oneof op_type { + Relation root = 1; + Command command = 2; + } +} + + + +// User Context is used to refer to one particular user session that is executing +// queries in the backend. +message UserContext { + string user_id = 1; + string user_name = 2; + + // To extend the existing user context message that is used to identify incoming requests, + // Spark Connect leverages the Any protobuf type that can be used to inject arbitrary other + // messages into this message. Extensions are stored as a `repeated` type to be able to + // handle multiple active extensions. + repeated google.protobuf.Any extensions = 999; +} + +// Request to perform plan analyze, optionally to explain the plan. +message AnalyzePlanRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + // The id should be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string session_id = 1; + + // (Required) User context + UserContext user_context = 2; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 3; + + oneof analyze { + Schema schema = 4; + Explain explain = 5; + TreeString tree_string = 6; + IsLocal is_local = 7; + IsStreaming is_streaming = 8; + InputFiles input_files = 9; + SparkVersion spark_version = 10; + DDLParse ddl_parse = 11; + SameSemantics same_semantics = 12; + SemanticHash semantic_hash = 13; + Persist persist = 14; + Unpersist unpersist = 15; + GetStorageLevel get_storage_level = 16; + } + + message Schema { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + // Explains the input plan based on a configurable mode. + message Explain { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + + // (Required) For analyzePlan rpc calls, configure the mode to explain plan in strings. + ExplainMode explain_mode = 2; + + // Plan explanation mode. + enum ExplainMode { + EXPLAIN_MODE_UNSPECIFIED = 0; + + // Generates only physical plan. + EXPLAIN_MODE_SIMPLE = 1; + + // Generates parsed logical plan, analyzed logical plan, optimized logical plan and physical plan. + // Parsed Logical plan is a unresolved plan that extracted from the query. Analyzed logical plans + // transforms which translates unresolvedAttribute and unresolvedRelation into fully typed objects. 
+ // The optimized logical plan transforms through a set of optimization rules, resulting in the + // physical plan. + EXPLAIN_MODE_EXTENDED = 2; + + // Generates code for the statement, if any and a physical plan. + EXPLAIN_MODE_CODEGEN = 3; + + // If plan node statistics are available, generates a logical plan and also the statistics. + EXPLAIN_MODE_COST = 4; + + // Generates a physical plan outline and also node details. + EXPLAIN_MODE_FORMATTED = 5; + } + } + + message TreeString { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + + // (Optional) Max level of the schema. + optional int32 level = 2; + } + + message IsLocal { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message IsStreaming { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message InputFiles { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message SparkVersion { } + + message DDLParse { + // (Required) The DDL formatted string to be parsed. + string ddl_string = 1; + } + + + // Returns `true` when the logical query plans are equal and therefore return same results. + message SameSemantics { + // (Required) The plan to be compared. + Plan target_plan = 1; + + // (Required) The other plan to be compared. + Plan other_plan = 2; + } + + message SemanticHash { + // (Required) The logical plan to get a hashCode. + Plan plan = 1; + } + + message Persist { + // (Required) The logical plan to persist. + Relation relation = 1; + + // (Optional) The storage level. + optional StorageLevel storage_level = 2; + } + + message Unpersist { + // (Required) The logical plan to unpersist. + Relation relation = 1; + + // (Optional) Whether to block until all blocks are deleted. + optional bool blocking = 2; + } + + message GetStorageLevel { + // (Required) The logical plan to get the storage level. + Relation relation = 1; + } +} + +// Response to performing analysis of the query. Contains relevant metadata to be able to +// reason about the performance. +message AnalyzePlanResponse { + string session_id = 1; + + oneof result { + Schema schema = 2; + Explain explain = 3; + TreeString tree_string = 4; + IsLocal is_local = 5; + IsStreaming is_streaming = 6; + InputFiles input_files = 7; + SparkVersion spark_version = 8; + DDLParse ddl_parse = 9; + SameSemantics same_semantics = 10; + SemanticHash semantic_hash = 11; + Persist persist = 12; + Unpersist unpersist = 13; + GetStorageLevel get_storage_level = 14; + } + + message Schema { + DataType schema = 1; + } + + message Explain { + string explain_string = 1; + } + + message TreeString { + string tree_string = 1; + } + + message IsLocal { + bool is_local = 1; + } + + message IsStreaming { + bool is_streaming = 1; + } + + message InputFiles { + // A best-effort snapshot of the files that compose this Dataset + repeated string files = 1; + } + + message SparkVersion { + string version = 1; + } + + message DDLParse { + DataType parsed = 1; + } + + message SameSemantics { + bool result = 1; + } + + message SemanticHash { + int32 result = 1; + } + + message Persist { } + + message Unpersist { } + + message GetStorageLevel { + // (Required) The StorageLevel as a result of get_storage_level request. + StorageLevel storage_level = 1; + } +} + +// A request to be executed by the service. +message ExecutePlanRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). 
The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + // The id should be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string session_id = 1; + + // (Required) User context + // + // user_context.user_id and session+id both identify a unique remote spark session on the + // server side. + UserContext user_context = 2; + + // (Optional) + // Provide an id for this request. If not provided, it will be generated by the server. + // It is returned in every ExecutePlanResponse.operation_id of the ExecutePlan response stream. + // The id must be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + optional string operation_id = 6; + + // (Required) The logical plan to be executed / analyzed. + Plan plan = 3; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 4; + + // Repeated element for options that can be passed to the request. This element is currently + // unused but allows to pass in an extension value used for arbitrary options. + repeated RequestOption request_options = 5; + + message RequestOption { + oneof request_option { + ReattachOptions reattach_options = 1; + // Extension type for request options + google.protobuf.Any extension = 999; + } + } + + // Tags to tag the given execution with. + // Tags cannot contain ',' character and cannot be empty strings. + // Used by Interrupt with interrupt.tag. + repeated string tags = 7; +} + +// The response of a query, can be one or more for each request. Responses belonging to the +// same input query, carry the same `session_id`. +message ExecutePlanResponse { + string session_id = 1; + + // Identifies the ExecutePlan execution. + // If set by the client in ExecutePlanRequest.operationId, that value is returned. + // Otherwise generated by the server. + // It is an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string operation_id = 12; + + // Identified the response in the stream. + // The id is an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string response_id = 13; + + // Union type for the different response messages. + oneof response_type { + ArrowBatch arrow_batch = 2; + + // Special case for executing SQL commands. + SqlCommandResult sql_command_result = 5; + + // Response for a streaming query. + WriteStreamOperationStartResult write_stream_operation_start_result = 8; + + // Response for commands on a streaming query. + StreamingQueryCommandResult streaming_query_command_result = 9; + + // Response for 'SparkContext.resources'. + GetResourcesCommandResult get_resources_command_result = 10; + + // Response for commands on the streaming query manager. + StreamingQueryManagerCommandResult streaming_query_manager_command_result = 11; + + // Response type informing if the stream is complete in reattachable execution. + ResultComplete result_complete = 14; + + // Support arbitrary result objects. + google.protobuf.Any extension = 999; + } + + // Metrics for the query execution. Typically, this field is only present in the last + // batch of results and then represent the overall state of the query execution. + Metrics metrics = 4; + + // The metrics observed during the execution of the query plan. 
+ repeated ObservedMetrics observed_metrics = 6; + + // (Optional) The Spark schema. This field is available when `collect` is called. + DataType schema = 7; + + // A SQL command returns an opaque Relation that can be directly used as input for the next + // call. + message SqlCommandResult { + Relation relation = 1; + } + + // Batch results of metrics. + message ArrowBatch { + int64 row_count = 1; + bytes data = 2; + } + + message Metrics { + + repeated MetricObject metrics = 1; + + message MetricObject { + string name = 1; + int64 plan_id = 2; + int64 parent = 3; + map execution_metrics = 4; + } + + message MetricValue { + string name = 1; + int64 value = 2; + string metric_type = 3; + } + } + + message ObservedMetrics { + string name = 1; + repeated Expression.Literal values = 2; + } + + message ResultComplete { + // If present, in a reattachable execution this means that after server sends onComplete, + // the execution is complete. If the server sends onComplete without sending a ResultComplete, + // it means that there is more, and the client should use ReattachExecute RPC to continue. + } +} + +// The key-value pair for the config request and response. +message KeyValue { + // (Required) The key. + string key = 1; + // (Optional) The value. + optional string value = 2; +} + +// Request to update or fetch the configurations. +message ConfigRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + // The id should be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string session_id = 1; + + // (Required) User context + UserContext user_context = 2; + + // (Required) The operation for the config. + Operation operation = 3; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 4; + + message Operation { + oneof op_type { + Set set = 1; + Get get = 2; + GetWithDefault get_with_default = 3; + GetOption get_option = 4; + GetAll get_all = 5; + Unset unset = 6; + IsModifiable is_modifiable = 7; + } + } + + message Set { + // (Required) The config key-value pairs to set. + repeated KeyValue pairs = 1; + } + + message Get { + // (Required) The config keys to get. + repeated string keys = 1; + } + + message GetWithDefault { + // (Required) The config key-value paris to get. The value will be used as the default value. + repeated KeyValue pairs = 1; + } + + message GetOption { + // (Required) The config keys to get optionally. + repeated string keys = 1; + } + + message GetAll { + // (Optional) The prefix of the config key to get. + optional string prefix = 1; + } + + message Unset { + // (Required) The config keys to unset. + repeated string keys = 1; + } + + message IsModifiable { + // (Required) The config keys to check the config is modifiable. + repeated string keys = 1; + } +} + +// Response to the config request. +message ConfigResponse { + string session_id = 1; + + // (Optional) The result key-value pairs. + // + // Available when the operation is 'Get', 'GetWithDefault', 'GetOption', 'GetAll'. + // Also available for the operation 'IsModifiable' with boolean string "true" and "false". 
+ repeated KeyValue pairs = 2; + + // (Optional) + // + // Warning messages for deprecated or unsupported configurations. + repeated string warnings = 3; +} + +// Request to transfer client-local artifacts. +message AddArtifactsRequest { + + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + // The id should be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string session_id = 1; + + // User context + UserContext user_context = 2; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 6; + + // A chunk of an Artifact. + message ArtifactChunk { + // Data chunk. + bytes data = 1; + // CRC to allow server to verify integrity of the chunk. + int64 crc = 2; + } + + // An artifact that is contained in a single `ArtifactChunk`. + // Generally, this message represents tiny artifacts such as REPL-generated class files. + message SingleChunkArtifact { + // The name of the artifact is expected in the form of a "Relative Path" that is made up of a + // sequence of directories and the final file element. + // Examples of "Relative Path"s: "jars/test.jar", "classes/xyz.class", "abc.xyz", "a/b/X.jar". + // The server is expected to maintain the hierarchy of files as defined by their name. (i.e + // The relative path of the file on the server's filesystem will be the same as the name of + // the provided artifact) + string name = 1; + // A single data chunk. + ArtifactChunk data = 2; + } + + // A number of `SingleChunkArtifact` batched into a single RPC. + message Batch { + repeated SingleChunkArtifact artifacts = 1; + } + + // Signals the beginning/start of a chunked artifact. + // A large artifact is transferred through a payload of `BeginChunkedArtifact` followed by a + // sequence of `ArtifactChunk`s. + message BeginChunkedArtifact { + // Name of the artifact undergoing chunking. Follows the same conventions as the `name` in + // the `Artifact` message. + string name = 1; + // Total size of the artifact in bytes. + int64 total_bytes = 2; + // Number of chunks the artifact is split into. + // This includes the `initial_chunk`. + int64 num_chunks = 3; + // The first/initial chunk. + ArtifactChunk initial_chunk = 4; + } + + // The payload is either a batch of artifacts or a partial chunk of a large artifact. + oneof payload { + Batch batch = 3; + // The metadata and the initial chunk of a large artifact chunked into multiple requests. + // The server side is notified about the total size of the large artifact as well as the + // number of chunks to expect. + BeginChunkedArtifact begin_chunk = 4; + // A chunk of an artifact excluding metadata. This can be any chunk of a large artifact + // excluding the first chunk (which is included in `BeginChunkedArtifact`). + ArtifactChunk chunk = 5; + } +} + +// Response to adding an artifact. Contains relevant metadata to verify successful transfer of +// artifact(s). +message AddArtifactsResponse { + // Metadata of an artifact. + message ArtifactSummary { + string name = 1; + // Whether the CRC (Cyclic Redundancy Check) is successful on server verification. + // The server discards any artifact that fails the CRC. 
+ // If false, the client may choose to resend the artifact specified by `name`. + bool is_crc_successful = 2; + } + + // The list of artifact(s) seen by the server. + repeated ArtifactSummary artifacts = 1; +} + +// Request to get current statuses of artifacts at the server side. +message ArtifactStatusesRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + // The id should be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string session_id = 1; + + // User context + UserContext user_context = 2; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 3; + + // The name of the artifact is expected in the form of a "Relative Path" that is made up of a + // sequence of directories and the final file element. + // Examples of "Relative Path"s: "jars/test.jar", "classes/xyz.class", "abc.xyz", "a/b/X.jar". + // The server is expected to maintain the hierarchy of files as defined by their name. (i.e + // The relative path of the file on the server's filesystem will be the same as the name of + // the provided artifact) + repeated string names = 4; +} + +// Response to checking artifact statuses. +message ArtifactStatusesResponse { + message ArtifactStatus { + // Exists or not particular artifact at the server. + bool exists = 1; + } + + // A map of artifact names to their statuses. + map statuses = 1; +} + +message InterruptRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + // The id should be an UUID string of the format `00112233-4455-6677-8899-aabbccddeeff` + string session_id = 1; + + // (Required) User context + UserContext user_context = 2; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 3; + + // (Required) The type of interrupt to execute. + InterruptType interrupt_type = 4; + + enum InterruptType { + INTERRUPT_TYPE_UNSPECIFIED = 0; + + // Interrupt all running executions within the session with the provided session_id. + INTERRUPT_TYPE_ALL = 1; + + // Interrupt all running executions within the session with the provided operation_tag. + INTERRUPT_TYPE_TAG = 2; + + // Interrupt the running execution within the session with the provided operation_id. + INTERRUPT_TYPE_OPERATION_ID = 3; + } + + oneof interrupt { + // if interrupt_tag == INTERRUPT_TYPE_TAG, interrupt operation with this tag. + string operation_tag = 5; + + // if interrupt_tag == INTERRUPT_TYPE_OPERATION_ID, interrupt operation with this operation_id. + string operation_id = 6; + } +} + +message InterruptResponse { + // Session id in which the interrupt was running. + string session_id = 1; + + // Operation ids of the executions which were interrupted. 
+ repeated string interrupted_ids = 2; +} + +message ReattachOptions { + // If true, the request can be reattached to using ReattachExecute. + // ReattachExecute can be used either if the stream broke with a GRPC network error, + // or if the server closed the stream without sending a response with StreamStatus.complete=true. + // The server will keep a buffer of responses in case a response is lost, and + // ReattachExecute needs to back-track. + // + // If false, the execution response stream will will not be reattachable, and all responses are + // immediately released by the server after being sent. + bool reattachable = 1; +} + +message ReattachExecuteRequest { + // (Required) + // + // The session_id of the request to reattach to. + // This must be an id of existing session. + string session_id = 1; + + // (Required) User context + // + // user_context.user_id and session+id both identify a unique remote spark session on the + // server side. + UserContext user_context = 2; + + // (Required) + // Provide an id of the request to reattach to. + // This must be an id of existing operation. + string operation_id = 3; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 4; + + // (Optional) + // Last already processed response id from the response stream. + // After reattach, server will resume the response stream after that response. + // If not specified, server will restart the stream from the start. + // + // Note: server controls the amount of responses that it buffers and it may drop responses, + // that are far behind the latest returned response, so this can't be used to arbitrarily + // scroll back the cursor. If the response is no longer available, this will result in an error. + optional string last_response_id = 5; +} + +message ReleaseExecuteRequest { + // (Required) + // + // The session_id of the request to reattach to. + // This must be an id of existing session. + string session_id = 1; + + // (Required) User context + // + // user_context.user_id and session+id both identify a unique remote spark session on the + // server side. + UserContext user_context = 2; + + // (Required) + // Provide an id of the request to reattach to. + // This must be an id of existing operation. + string operation_id = 3; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 4; + + // Release and close operation completely. + // This will also interrupt the query if it is running execution, and wait for it to be torn down. + message ReleaseAll {} + + // Release all responses from the operation response stream up to and including + // the response with the given by response_id. + // While server determines by itself how much of a buffer of responses to keep, client providing + // explicit release calls will help reduce resource consumption. + // Noop if response_id not found in cached responses. + message ReleaseUntil { + string response_id = 1; + } + + oneof release { + ReleaseAll release_all = 5; + ReleaseUntil release_until = 6; + } +} + +message ReleaseExecuteResponse { + // Session id in which the release was running. 
+ string session_id = 1; + + // Operation id of the operation on which the release executed. + // If the operation couldn't be found (because e.g. it was concurrently released), will be unset. + // Otherwise, it will be equal to the operation_id from request. + optional string operation_id = 2; +} + +// Main interface for the SparkConnect service. +service SparkConnectService { + + // Executes a request that contains the query and returns a stream of [[Response]]. + // + // It is guaranteed that there is at least one ARROW batch returned even if the result set is empty. + rpc ExecutePlan(ExecutePlanRequest) returns (stream ExecutePlanResponse) {} + + // Analyzes a query and returns a [[AnalyzeResponse]] containing metadata about the query. + rpc AnalyzePlan(AnalyzePlanRequest) returns (AnalyzePlanResponse) {} + + // Update or fetch the configurations and returns a [[ConfigResponse]] containing the result. + rpc Config(ConfigRequest) returns (ConfigResponse) {} + + // Add artifacts to the session and returns a [[AddArtifactsResponse]] containing metadata about + // the added artifacts. + rpc AddArtifacts(stream AddArtifactsRequest) returns (AddArtifactsResponse) {} + + // Check statuses of artifacts in the session and returns them in a [[ArtifactStatusesResponse]] + rpc ArtifactStatus(ArtifactStatusesRequest) returns (ArtifactStatusesResponse) {} + + // Interrupts running executions + rpc Interrupt(InterruptRequest) returns (InterruptResponse) {} + + // Reattach to an existing reattachable execution. + // The ExecutePlan must have been started with ReattachOptions.reattachable=true. + // If the ExecutePlanResponse stream ends without a ResultComplete message, there is more to + // continue. If there is a ResultComplete, the client should use ReleaseExecute with + rpc ReattachExecute(ReattachExecuteRequest) returns (stream ExecutePlanResponse) {} + + // Release an reattachable execution, or parts thereof. + // The ExecutePlan must have been started with ReattachOptions.reattachable=true. + // Non reattachable executions are released automatically and immediately after the ExecutePlan + // RPC and ReleaseExecute may not be used. + rpc ReleaseExecute(ReleaseExecuteRequest) returns (ReleaseExecuteResponse) {} +} + diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/catalog.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/catalog.proto new file mode 100644 index 0000000000000..5b1b90b0087d0 --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/catalog.proto @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = 'proto3'; + +package spark.connect; + +import "spark/connect/common.proto"; +import "spark/connect/types.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// Catalog messages are marked as unstable. +message Catalog { + oneof cat_type { + CurrentDatabase current_database = 1; + SetCurrentDatabase set_current_database = 2; + ListDatabases list_databases = 3; + ListTables list_tables = 4; + ListFunctions list_functions = 5; + ListColumns list_columns = 6; + GetDatabase get_database = 7; + GetTable get_table = 8; + GetFunction get_function = 9; + DatabaseExists database_exists = 10; + TableExists table_exists = 11; + FunctionExists function_exists = 12; + CreateExternalTable create_external_table = 13; + CreateTable create_table = 14; + DropTempView drop_temp_view = 15; + DropGlobalTempView drop_global_temp_view = 16; + RecoverPartitions recover_partitions = 17; + IsCached is_cached = 18; + CacheTable cache_table = 19; + UncacheTable uncache_table = 20; + ClearCache clear_cache = 21; + RefreshTable refresh_table = 22; + RefreshByPath refresh_by_path = 23; + CurrentCatalog current_catalog = 24; + SetCurrentCatalog set_current_catalog = 25; + ListCatalogs list_catalogs = 26; + } +} + +// See `spark.catalog.currentDatabase` +message CurrentDatabase { } + +// See `spark.catalog.setCurrentDatabase` +message SetCurrentDatabase { + // (Required) + string db_name = 1; +} + +// See `spark.catalog.listDatabases` +message ListDatabases { + // (Optional) The pattern that the database name needs to match + optional string pattern = 1; +} + +// See `spark.catalog.listTables` +message ListTables { + // (Optional) + optional string db_name = 1; + // (Optional) The pattern that the table name needs to match + optional string pattern = 2; +} + +// See `spark.catalog.listFunctions` +message ListFunctions { + // (Optional) + optional string db_name = 1; + // (Optional) The pattern that the function name needs to match + optional string pattern = 2; +} + +// See `spark.catalog.listColumns` +message ListColumns { + // (Required) + string table_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.getDatabase` +message GetDatabase { + // (Required) + string db_name = 1; +} + +// See `spark.catalog.getTable` +message GetTable { + // (Required) + string table_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.getFunction` +message GetFunction { + // (Required) + string function_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.databaseExists` +message DatabaseExists { + // (Required) + string db_name = 1; +} + +// See `spark.catalog.tableExists` +message TableExists { + // (Required) + string table_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.functionExists` +message FunctionExists { + // (Required) + string function_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.createExternalTable` +message CreateExternalTable { + // (Required) + string table_name = 1; + // (Optional) + optional string path = 2; + // (Optional) + optional string source = 3; + // (Optional) + optional DataType schema = 4; + // Options could be empty for valid data source format. + // The map key is case insensitive. 
+ map options = 5; +} + +// See `spark.catalog.createTable` +message CreateTable { + // (Required) + string table_name = 1; + // (Optional) + optional string path = 2; + // (Optional) + optional string source = 3; + // (Optional) + optional string description = 4; + // (Optional) + optional DataType schema = 5; + // Options could be empty for valid data source format. + // The map key is case insensitive. + map options = 6; +} + +// See `spark.catalog.dropTempView` +message DropTempView { + // (Required) + string view_name = 1; +} + +// See `spark.catalog.dropGlobalTempView` +message DropGlobalTempView { + // (Required) + string view_name = 1; +} + +// See `spark.catalog.recoverPartitions` +message RecoverPartitions { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.isCached` +message IsCached { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.cacheTable` +message CacheTable { + // (Required) + string table_name = 1; + + // (Optional) + optional StorageLevel storage_level = 2; +} + +// See `spark.catalog.uncacheTable` +message UncacheTable { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.clearCache` +message ClearCache { } + +// See `spark.catalog.refreshTable` +message RefreshTable { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.refreshByPath` +message RefreshByPath { + // (Required) + string path = 1; +} + +// See `spark.catalog.currentCatalog` +message CurrentCatalog { } + +// See `spark.catalog.setCurrentCatalog` +message SetCurrentCatalog { + // (Required) + string catalog_name = 1; +} + +// See `spark.catalog.listCatalogs` +message ListCatalogs { + // (Optional) The pattern that the catalog name needs to match + optional string pattern = 1; +} diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/commands.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/commands.proto new file mode 100644 index 0000000000000..49b25f099bf2e --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/commands.proto @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +import "google/protobuf/any.proto"; +import "spark/connect/common.proto"; +import "spark/connect/expressions.proto"; +import "spark/connect/relations.proto"; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// A [[Command]] is an operation that is executed by the server that does not directly consume or +// produce a relational result. 
+message Command { + oneof command_type { + CommonInlineUserDefinedFunction register_function = 1; + WriteOperation write_operation = 2; + CreateDataFrameViewCommand create_dataframe_view = 3; + WriteOperationV2 write_operation_v2 = 4; + SqlCommand sql_command = 5; + WriteStreamOperationStart write_stream_operation_start = 6; + StreamingQueryCommand streaming_query_command = 7; + GetResourcesCommand get_resources_command = 8; + StreamingQueryManagerCommand streaming_query_manager_command = 9; + CommonInlineUserDefinedTableFunction register_table_function = 10; + + // This field is used to mark extensions to the protocol. When plugins generate arbitrary + // Commands they can add them here. During the planning the correct resolution is done. + google.protobuf.Any extension = 999; + + } +} + +// A SQL Command is used to trigger the eager evaluation of SQL commands in Spark. +// +// When the SQL provide as part of the message is a command it will be immediately evaluated +// and the result will be collected and returned as part of a LocalRelation. If the result is +// not a command, the operation will simply return a SQL Relation. This allows the client to be +// almost oblivious to the server-side behavior. +message SqlCommand { + // (Required) SQL Query. + string sql = 1; + + // (Optional) A map of parameter names to literal expressions. + map args = 2; + + // (Optional) A sequence of literal expressions for positional parameters in the SQL query text. + repeated Expression.Literal pos_args = 3; +} + +// A command that can create DataFrame global temp view or local temp view. +message CreateDataFrameViewCommand { + // (Required) The relation that this view will be built on. + Relation input = 1; + + // (Required) View name. + string name = 2; + + // (Required) Whether this is global temp view or local temp view. + bool is_global = 3; + + // (Required) + // + // If true, and if the view already exists, updates it; if false, and if the view + // already exists, throws exception. + bool replace = 4; +} + +// As writes are not directly handled during analysis and planning, they are modeled as commands. +message WriteOperation { + // (Required) The output of the `input` relation will be persisted according to the options. + Relation input = 1; + + // (Optional) Format value according to the Spark documentation. Examples are: text, parquet, delta. + optional string source = 2; + + // (Optional) + // + // The destination of the write operation can be either a path or a table. + // If the destination is neither a path nor a table, such as jdbc and noop, + // the `save_type` should not be set. + oneof save_type { + string path = 3; + SaveTable table = 4; + } + + // (Required) the save mode. + SaveMode mode = 5; + + // (Optional) List of columns to sort the output by. + repeated string sort_column_names = 6; + + // (Optional) List of columns for partitioning. + repeated string partitioning_columns = 7; + + // (Optional) Bucketing specification. Bucketing must set the number of buckets and the columns + // to bucket by. + BucketBy bucket_by = 8; + + // (Optional) A list of configuration options. + map options = 9; + + message SaveTable { + // (Required) The table name. + string table_name = 1; + // (Required) The method to be called to write to the table. 
+ TableSaveMethod save_method = 2; + + enum TableSaveMethod { + TABLE_SAVE_METHOD_UNSPECIFIED = 0; + TABLE_SAVE_METHOD_SAVE_AS_TABLE = 1; + TABLE_SAVE_METHOD_INSERT_INTO = 2; + } + } + + message BucketBy { + repeated string bucket_column_names = 1; + int32 num_buckets = 2; + } + + enum SaveMode { + SAVE_MODE_UNSPECIFIED = 0; + SAVE_MODE_APPEND = 1; + SAVE_MODE_OVERWRITE = 2; + SAVE_MODE_ERROR_IF_EXISTS = 3; + SAVE_MODE_IGNORE = 4; + } +} + +// As writes are not directly handled during analysis and planning, they are modeled as commands. +message WriteOperationV2 { + // (Required) The output of the `input` relation will be persisted according to the options. + Relation input = 1; + + // (Required) The destination of the write operation must be either a path or a table. + string table_name = 2; + + // (Optional) A provider for the underlying output data source. Spark's default catalog supports + // "parquet", "json", etc. + optional string provider = 3; + + // (Optional) List of columns for partitioning for output table created by `create`, + // `createOrReplace`, or `replace` + repeated Expression partitioning_columns = 4; + + // (Optional) A list of configuration options. + map options = 5; + + // (Optional) A list of table properties. + map table_properties = 6; + + // (Required) Write mode. + Mode mode = 7; + + enum Mode { + MODE_UNSPECIFIED = 0; + MODE_CREATE = 1; + MODE_OVERWRITE = 2; + MODE_OVERWRITE_PARTITIONS = 3; + MODE_APPEND = 4; + MODE_REPLACE = 5; + MODE_CREATE_OR_REPLACE = 6; + } + + // (Optional) A condition for overwrite saving mode + Expression overwrite_condition = 8; +} + +// Starts write stream operation as streaming query. Query ID and Run ID of the streaming +// query are returned. +message WriteStreamOperationStart { + + // (Required) The output of the `input` streaming relation will be written. + Relation input = 1; + + // The following fields directly map to API for DataStreamWriter(). + // Consult API documentation unless explicitly documented here. + + string format = 2; + map options = 3; + repeated string partitioning_column_names = 4; + + oneof trigger { + string processing_time_interval = 5; + bool available_now = 6; + bool once = 7; + string continuous_checkpoint_interval = 8; + } + + string output_mode = 9; + string query_name = 10; + + // The destination is optional. When set, it can be a path or a table name. + oneof sink_destination { + string path = 11; + string table_name = 12; + } + + StreamingForeachFunction foreach_writer = 13; + StreamingForeachFunction foreach_batch = 14; +} + +message StreamingForeachFunction { + oneof function { + PythonUDF python_function = 1; + ScalarScalaUDF scala_function = 2; + } +} + +message WriteStreamOperationStartResult { + + // (Required) Query instance. See `StreamingQueryInstanceId`. + StreamingQueryInstanceId query_id = 1; + + // An optional query name. + string name = 2; + + // TODO: How do we indicate errors? + // TODO: Consider adding status, last progress etc here. +} + +// A tuple that uniquely identifies an instance of streaming query run. It consists of `id` that +// persists across the streaming runs and `run_id` that changes between each run of the +// streaming query that resumes from the checkpoint. +message StreamingQueryInstanceId { + + // (Required) The unique id of this query that persists across restarts from checkpoint data. + // That is, this id is generated when a query is started for the first time, and + // will be the same every time it is restarted from checkpoint data. 
+ string id = 1; + + // (Required) The unique id of this run of the query. That is, every start/restart of a query + // will generate a unique run_id. Therefore, every time a query is restarted from + // checkpoint, it will have the same `id` but different `run_id`s. + string run_id = 2; +} + +// Commands for a streaming query. +message StreamingQueryCommand { + + // (Required) Query instance. See `StreamingQueryInstanceId`. + StreamingQueryInstanceId query_id = 1; + + // See documentation for the corresponding API method in StreamingQuery. + oneof command { + // status() API. + bool status = 2; + // lastProgress() API. + bool last_progress = 3; + // recentProgress() API. + bool recent_progress = 4; + // stop() API. Stops the query. + bool stop = 5; + // processAllAvailable() API. Waits till all the available data is processed + bool process_all_available = 6; + // explain() API. Returns logical and physical plans. + ExplainCommand explain = 7; + // exception() API. Returns the exception in the query if any. + bool exception = 8; + // awaitTermination() API. Waits for the termination of the query. + AwaitTerminationCommand await_termination = 9; + } + + message ExplainCommand { + // TODO: Consider reusing Explain from AnalyzePlanRequest message. + // We can not do this right now since it base.proto imports this file. + bool extended = 1; + } + + message AwaitTerminationCommand { + optional int64 timeout_ms = 2; + } +} + +// Response for commands on a streaming query. +message StreamingQueryCommandResult { + // (Required) Query instance id. See `StreamingQueryInstanceId`. + StreamingQueryInstanceId query_id = 1; + + oneof result_type { + StatusResult status = 2; + RecentProgressResult recent_progress = 3; + ExplainResult explain = 4; + ExceptionResult exception = 5; + AwaitTerminationResult await_termination = 6; + } + + message StatusResult { + // See documentation for these Scala 'StreamingQueryStatus' struct + string status_message = 1; + bool is_data_available = 2; + bool is_trigger_active = 3; + bool is_active = 4; + } + + message RecentProgressResult { + // Progress reports as an array of json strings. + repeated string recent_progress_json = 5; + } + + message ExplainResult { + // Logical and physical plans as string + string result = 1; + } + + message ExceptionResult { + // (Optional) Exception message as string, maps to the return value of original + // StreamingQueryException's toString method + optional string exception_message = 1; + // (Optional) Exception error class as string + optional string error_class = 2; + // (Optional) Exception stack trace as string + optional string stack_trace = 3; + } + + message AwaitTerminationResult { + bool terminated = 1; + } +} + +// Commands for the streaming query manager. +message StreamingQueryManagerCommand { + + // See documentation for the corresponding API method in StreamingQueryManager. + oneof command { + // active() API, returns a list of active queries. + bool active = 1; + // get() API, returns the StreamingQuery identified by id. + string get_query = 2; + // awaitAnyTermination() API, wait until any query terminates or timeout. + AwaitAnyTerminationCommand await_any_termination = 3; + // resetTerminated() API. + bool reset_terminated = 4; + // addListener API. + StreamingQueryListenerCommand add_listener = 5; + // removeListener API. + StreamingQueryListenerCommand remove_listener = 6; + // listListeners() API, returns a list of streaming query listeners. 
+ bool list_listeners = 7; + } + + message AwaitAnyTerminationCommand { + // (Optional) The waiting time in milliseconds to wait for any query to terminate. + optional int64 timeout_ms = 1; + } + + message StreamingQueryListenerCommand { + bytes listener_payload = 1; + optional PythonUDF python_listener_payload = 2; + string id = 3; + } +} + +// Response for commands on the streaming query manager. +message StreamingQueryManagerCommandResult { + oneof result_type { + ActiveResult active = 1; + StreamingQueryInstance query = 2; + AwaitAnyTerminationResult await_any_termination = 3; + bool reset_terminated = 4; + bool add_listener = 5; + bool remove_listener = 6; + ListStreamingQueryListenerResult list_listeners = 7; + } + + message ActiveResult { + repeated StreamingQueryInstance active_queries = 1; + } + + message StreamingQueryInstance { + // (Required) The id and runId of this query. + StreamingQueryInstanceId id = 1; + // (Optional) The name of this query. + optional string name = 2; + } + + message AwaitAnyTerminationResult { + bool terminated = 1; + } + + message StreamingQueryListenerInstance { + bytes listener_payload = 1; + } + + message ListStreamingQueryListenerResult { + // (Required) Reference IDs of listener instances. + repeated string listener_ids = 1; + } +} + +// Command to get the output of 'SparkContext.resources' +message GetResourcesCommand { } + +// Response for command 'GetResourcesCommand'. +message GetResourcesCommandResult { + map resources = 1; +} diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/common.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/common.proto new file mode 100644 index 0000000000000..5c538cf10825e --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/common.proto @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// StorageLevel for persisting Datasets/Tables. +message StorageLevel { + // (Required) Whether the cache should use disk or not. + bool use_disk = 1; + // (Required) Whether the cache should use memory or not. + bool use_memory = 2; + // (Required) Whether the cache should use off-heap or not. + bool use_off_heap = 3; + // (Required) Whether the cached data is deserialized or not. + bool deserialized = 4; + // (Required) The number of replicas. + int32 replication = 5; +} + + +// ResourceInformation to hold information about a type of Resource. 
+// The corresponding class is 'org.apache.spark.resource.ResourceInformation' +message ResourceInformation { + // (Required) The name of the resource + string name = 1; + // (Required) An array of strings describing the addresses of the resource. + repeated string addresses = 2; +} diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/example_plugins.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/example_plugins.proto new file mode 100644 index 0000000000000..7ad171d2e8a1e --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/example_plugins.proto @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +import "spark/connect/relations.proto"; +import "spark/connect/expressions.proto"; +option go_package = "internal/generated"; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +message ExamplePluginRelation { + Relation input = 1; + string custom_field = 2; + +} + +message ExamplePluginExpression { + Expression child = 1; + string custom_field = 2; +} + +message ExamplePluginCommand { + string custom_field = 1; +} \ No newline at end of file diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/expressions.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/expressions.proto new file mode 100644 index 0000000000000..557b9db912376 --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/expressions.proto @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +import "google/protobuf/any.proto"; +import "spark/connect/types.proto"; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// Expression used to refer to fields, functions and similar. This can be used everywhere +// expressions in SQL appear. 
+message Expression { + + oneof expr_type { + Literal literal = 1; + UnresolvedAttribute unresolved_attribute = 2; + UnresolvedFunction unresolved_function = 3; + ExpressionString expression_string = 4; + UnresolvedStar unresolved_star = 5; + Alias alias = 6; + Cast cast = 7; + UnresolvedRegex unresolved_regex = 8; + SortOrder sort_order = 9; + LambdaFunction lambda_function = 10; + Window window = 11; + UnresolvedExtractValue unresolved_extract_value = 12; + UpdateFields update_fields = 13; + UnresolvedNamedLambdaVariable unresolved_named_lambda_variable = 14; + CommonInlineUserDefinedFunction common_inline_user_defined_function = 15; + CallFunction call_function = 16; + + // This field is used to mark extensions to the protocol. When plugins generate arbitrary + // relations they can add them here. During the planning the correct resolution is done. + google.protobuf.Any extension = 999; + } + + + // Expression for the OVER clause or WINDOW clause. + message Window { + + // (Required) The window function. + Expression window_function = 1; + + // (Optional) The way that input rows are partitioned. + repeated Expression partition_spec = 2; + + // (Optional) Ordering of rows in a partition. + repeated SortOrder order_spec = 3; + + // (Optional) Window frame in a partition. + // + // If not set, it will be treated as 'UnspecifiedFrame'. + WindowFrame frame_spec = 4; + + // The window frame + message WindowFrame { + + // (Required) The type of the frame. + FrameType frame_type = 1; + + // (Required) The lower bound of the frame. + FrameBoundary lower = 2; + + // (Required) The upper bound of the frame. + FrameBoundary upper = 3; + + enum FrameType { + FRAME_TYPE_UNDEFINED = 0; + + // RowFrame treats rows in a partition individually. + FRAME_TYPE_ROW = 1; + + // RangeFrame treats rows in a partition as groups of peers. + // All rows having the same 'ORDER BY' ordering are considered as peers. + FRAME_TYPE_RANGE = 2; + } + + message FrameBoundary { + oneof boundary { + // CURRENT ROW boundary + bool current_row = 1; + + // UNBOUNDED boundary. + // For lower bound, it will be converted to 'UnboundedPreceding'. + // for upper bound, it will be converted to 'UnboundedFollowing'. + bool unbounded = 2; + + // This is an expression for future proofing. We are expecting literals on the server side. + Expression value = 3; + } + } + } + } + + // SortOrder is used to specify the data ordering, it is normally used in Sort and Window. + // It is an unevaluable expression and cannot be evaluated, so can not be used in Projection. + message SortOrder { + // (Required) The expression to be sorted. + Expression child = 1; + + // (Required) The sort direction, should be ASCENDING or DESCENDING. + SortDirection direction = 2; + + // (Required) How to deal with NULLs, should be NULLS_FIRST or NULLS_LAST. + NullOrdering null_ordering = 3; + + enum SortDirection { + SORT_DIRECTION_UNSPECIFIED = 0; + SORT_DIRECTION_ASCENDING = 1; + SORT_DIRECTION_DESCENDING = 2; + } + + enum NullOrdering { + SORT_NULLS_UNSPECIFIED = 0; + SORT_NULLS_FIRST = 1; + SORT_NULLS_LAST = 2; + } + } + + message Cast { + // (Required) the expression to be casted. + Expression expr = 1; + + // (Required) the data type that the expr to be casted to. + oneof cast_to_type { + DataType type = 2; + // If this is set, Server will use Catalyst parser to parse this string to DataType. 
+ string type_str = 3; + } + } + + message Literal { + oneof literal_type { + DataType null = 1; + bytes binary = 2; + bool boolean = 3; + + int32 byte = 4; + int32 short = 5; + int32 integer = 6; + int64 long = 7; + float float = 10; + double double = 11; + Decimal decimal = 12; + + string string = 13; + + // Date in units of days since the UNIX epoch. + int32 date = 16; + // Timestamp in units of microseconds since the UNIX epoch. + int64 timestamp = 17; + // Timestamp in units of microseconds since the UNIX epoch (without timezone information). + int64 timestamp_ntz = 18; + + CalendarInterval calendar_interval = 19; + int32 year_month_interval = 20; + int64 day_time_interval = 21; + Array array = 22; + Map map = 23; + Struct struct = 24; + } + + message Decimal { + // the string representation. + string value = 1; + // The maximum number of digits allowed in the value. + // the maximum precision is 38. + optional int32 precision = 2; + // declared scale of decimal literal + optional int32 scale = 3; + } + + message CalendarInterval { + int32 months = 1; + int32 days = 2; + int64 microseconds = 3; + } + + message Array { + DataType element_type = 1; + repeated Literal elements = 2; + } + + message Map { + DataType key_type = 1; + DataType value_type = 2; + repeated Literal keys = 3; + repeated Literal values = 4; + } + + message Struct { + DataType struct_type = 1; + repeated Literal elements = 2; + } + } + + // An unresolved attribute that is not explicitly bound to a specific column, but the column + // is resolved during analysis by name. + message UnresolvedAttribute { + // (Required) An identifier that will be parsed by Catalyst parser. This should follow the + // Spark SQL identifier syntax. + string unparsed_identifier = 1; + + // (Optional) The id of corresponding connect plan. + optional int64 plan_id = 2; + } + + // An unresolved function is not explicitly bound to one explicit function, but the function + // is resolved during analysis following Sparks name resolution rules. + message UnresolvedFunction { + // (Required) name (or unparsed name for user defined function) for the unresolved function. + string function_name = 1; + + // (Optional) Function arguments. Empty arguments are allowed. + repeated Expression arguments = 2; + + // (Required) Indicate if this function should be applied on distinct values. + bool is_distinct = 3; + + // (Required) Indicate if this is a user defined function. + // + // When it is not a user defined function, Connect will use the function name directly. + // When it is a user defined function, Connect will parse the function name first. + bool is_user_defined_function = 4; + } + + // Expression as string. + message ExpressionString { + // (Required) A SQL expression that will be parsed by Catalyst parser. + string expression = 1; + } + + // UnresolvedStar is used to expand all the fields of a relation or struct. + message UnresolvedStar { + + // (Optional) The target of the expansion. + // + // If set, it should end with '.*' and will be parsed by 'parseAttributeName' + // in the server side. + optional string unparsed_target = 1; + } + + // Represents all of the input attributes to a given relational operator, for example in + // "SELECT `(id)?+.+` FROM ...". + message UnresolvedRegex { + // (Required) The column name used to extract column with regex. + string col_name = 1; + + // (Optional) The id of corresponding connect plan. 
+ optional int64 plan_id = 2; + } + + // Extracts a value or values from an Expression + message UnresolvedExtractValue { + // (Required) The expression to extract value from, can be + // Map, Array, Struct or array of Structs. + Expression child = 1; + + // (Required) The expression to describe the extraction, can be + // key of Map, index of Array, field name of Struct. + Expression extraction = 2; + } + + // Add, replace or drop a field of `StructType` expression by name. + message UpdateFields { + // (Required) The struct expression. + Expression struct_expression = 1; + + // (Required) The field name. + string field_name = 2; + + // (Optional) The expression to add or replace. + // + // When not set, it means this field will be dropped. + Expression value_expression = 3; + } + + message Alias { + // (Required) The expression that alias will be added on. + Expression expr = 1; + + // (Required) a list of name parts for the alias. + // + // Scalar columns only has one name that presents. + repeated string name = 2; + + // (Optional) Alias metadata expressed as a JSON map. + optional string metadata = 3; + } + + message LambdaFunction { + // (Required) The lambda function. + // + // The function body should use 'UnresolvedAttribute' as arguments, the sever side will + // replace 'UnresolvedAttribute' with 'UnresolvedNamedLambdaVariable'. + Expression function = 1; + + // (Required) Function variables. Must contains 1 ~ 3 variables. + repeated Expression.UnresolvedNamedLambdaVariable arguments = 2; + } + + message UnresolvedNamedLambdaVariable { + + // (Required) a list of name parts for the variable. Must not be empty. + repeated string name_parts = 1; + } +} + +message CommonInlineUserDefinedFunction { + // (Required) Name of the user-defined function. + string function_name = 1; + // (Optional) Indicate if the user-defined function is deterministic. + bool deterministic = 2; + // (Optional) Function arguments. Empty arguments are allowed. + repeated Expression arguments = 3; + // (Required) Indicate the function type of the user-defined function. + oneof function { + PythonUDF python_udf = 4; + ScalarScalaUDF scalar_scala_udf = 5; + JavaUDF java_udf = 6; + } +} + +message PythonUDF { + // (Required) Output type of the Python UDF + DataType output_type = 1; + // (Required) EvalType of the Python UDF + int32 eval_type = 2; + // (Required) The encoded commands of the Python UDF + bytes command = 3; + // (Required) Python version being used in the client. + string python_ver = 4; +} + +message ScalarScalaUDF { + // (Required) Serialized JVM object containing UDF definition, input encoders and output encoder + bytes payload = 1; + // (Optional) Input type(s) of the UDF + repeated DataType inputTypes = 2; + // (Required) Output type of the UDF + DataType outputType = 3; + // (Required) True if the UDF can return null value + bool nullable = 4; +} + +message JavaUDF { + // (Required) Fully qualified name of Java class + string class_name = 1; + + // (Optional) Output type of the Java UDF + optional DataType output_type = 2; + + // (Required) Indicate if the Java user-defined function is an aggregate function + bool aggregate = 3; +} + +message CallFunction { + // (Required) Unparsed name of the SQL function. + string function_name = 1; + + // (Optional) Function arguments. Empty arguments are allowed. 
+ repeated Expression arguments = 2; +} diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/relations.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/relations.proto new file mode 100644 index 0000000000000..f7f1315ede0f8 --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/relations.proto @@ -0,0 +1,1003 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +import "google/protobuf/any.proto"; +import "spark/connect/expressions.proto"; +import "spark/connect/types.proto"; +import "spark/connect/catalog.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// The main [[Relation]] type. Fundamentally, a relation is a typed container +// that has exactly one explicit relation type set. +// +// When adding new relation types, they have to be registered here. +message Relation { + RelationCommon common = 1; + oneof rel_type { + Read read = 2; + Project project = 3; + Filter filter = 4; + Join join = 5; + SetOperation set_op = 6; + Sort sort = 7; + Limit limit = 8; + Aggregate aggregate = 9; + SQL sql = 10; + LocalRelation local_relation = 11; + Sample sample = 12; + Offset offset = 13; + Deduplicate deduplicate = 14; + Range range = 15; + SubqueryAlias subquery_alias = 16; + Repartition repartition = 17; + ToDF to_df = 18; + WithColumnsRenamed with_columns_renamed = 19; + ShowString show_string = 20; + Drop drop = 21; + Tail tail = 22; + WithColumns with_columns = 23; + Hint hint = 24; + Unpivot unpivot = 25; + ToSchema to_schema = 26; + RepartitionByExpression repartition_by_expression = 27; + MapPartitions map_partitions = 28; + CollectMetrics collect_metrics = 29; + Parse parse = 30; + GroupMap group_map = 31; + CoGroupMap co_group_map = 32; + WithWatermark with_watermark = 33; + ApplyInPandasWithState apply_in_pandas_with_state = 34; + HtmlString html_string = 35; + CachedLocalRelation cached_local_relation = 36; + CachedRemoteRelation cached_remote_relation = 37; + CommonInlineUserDefinedTableFunction common_inline_user_defined_table_function = 38; + + // NA functions + NAFill fill_na = 90; + NADrop drop_na = 91; + NAReplace replace = 92; + + // stat functions + StatSummary summary = 100; + StatCrosstab crosstab = 101; + StatDescribe describe = 102; + StatCov cov = 103; + StatCorr corr = 104; + StatApproxQuantile approx_quantile = 105; + StatFreqItems freq_items = 106; + StatSampleBy sample_by = 107; + + // Catalog API (experimental / unstable) + Catalog catalog = 200; + + // This field is used to mark extensions to the protocol. When plugins generate arbitrary + // relations they can add them here. During the planning the correct resolution is done. 
+ google.protobuf.Any extension = 998; + Unknown unknown = 999; + } +} + +// Used for testing purposes only. +message Unknown {} + +// Common metadata of all relations. +message RelationCommon { + // (Required) Shared relation metadata. + string source_info = 1; + + // (Optional) A per-client globally unique id for a given connect plan. + optional int64 plan_id = 2; +} + +// Relation that uses a SQL query to generate the output. +message SQL { + // (Required) The SQL query. + string query = 1; + + // (Optional) A map of parameter names to literal expressions. + map args = 2; + + // (Optional) A sequence of literal expressions for positional parameters in the SQL query text. + repeated Expression.Literal pos_args = 3; +} + +// Relation that reads from a file / table or other data source. Does not have additional +// inputs. +message Read { + oneof read_type { + NamedTable named_table = 1; + DataSource data_source = 2; + } + + // (Optional) Indicates if this is a streaming read. + bool is_streaming = 3; + + message NamedTable { + // (Required) Unparsed identifier for the table. + string unparsed_identifier = 1; + + // Options for the named table. The map key is case insensitive. + map options = 2; + } + + message DataSource { + // (Optional) Supported formats include: parquet, orc, text, json, parquet, csv, avro. + // + // If not set, the value from SQL conf 'spark.sql.sources.default' will be used. + optional string format = 1; + + // (Optional) If not set, Spark will infer the schema. + // + // This schema string should be either DDL-formatted or JSON-formatted. + optional string schema = 2; + + // Options for the data source. The context of this map varies based on the + // data source format. This options could be empty for valid data source format. + // The map key is case insensitive. + map options = 3; + + // (Optional) A list of path for file-system backed data sources. + repeated string paths = 4; + + // (Optional) Condition in the where clause for each partition. + // + // This is only supported by the JDBC data source. + repeated string predicates = 5; + } +} + +// Projection of a bag of expressions for a given input relation. +// +// The input relation must be specified. +// The projected expression can be an arbitrary expression. +message Project { + // (Optional) Input relation is optional for Project. + // + // For example, `SELECT ABS(-1)` is valid plan without an input plan. + Relation input = 1; + + // (Required) A Project requires at least one expression. + repeated Expression expressions = 3; +} + +// Relation that applies a boolean expression `condition` on each row of `input` to produce +// the output result. +message Filter { + // (Required) Input relation for a Filter. + Relation input = 1; + + // (Required) A Filter must have a condition expression. + Expression condition = 2; +} + +// Relation of type [[Join]]. +// +// `left` and `right` must be present. +message Join { + // (Required) Left input relation for a Join. + Relation left = 1; + + // (Required) Right input relation for a Join. + Relation right = 2; + + // (Optional) The join condition. Could be unset when `using_columns` is utilized. + // + // This field does not co-exist with using_columns. + Expression join_condition = 3; + + // (Required) The join type. + JoinType join_type = 4; + + // Optional. using_columns provides a list of columns that should present on both sides of + // the join inputs that this Join will join on. 
For example A JOIN B USING col_name is + // equivalent to A JOIN B on A.col_name = B.col_name. + // + // This field does not co-exist with join_condition. + repeated string using_columns = 5; + + enum JoinType { + JOIN_TYPE_UNSPECIFIED = 0; + JOIN_TYPE_INNER = 1; + JOIN_TYPE_FULL_OUTER = 2; + JOIN_TYPE_LEFT_OUTER = 3; + JOIN_TYPE_RIGHT_OUTER = 4; + JOIN_TYPE_LEFT_ANTI = 5; + JOIN_TYPE_LEFT_SEMI = 6; + JOIN_TYPE_CROSS = 7; + } + + // (Optional) Only used by joinWith. Set the left and right join data types. + optional JoinDataType join_data_type = 6; + + message JoinDataType { + // If the left data type is a struct. + bool is_left_struct = 1; + // If the right data type is a struct. + bool is_right_struct = 2; + } +} + +// Relation of type [[SetOperation]] +message SetOperation { + // (Required) Left input relation for a Set operation. + Relation left_input = 1; + + // (Required) Right input relation for a Set operation. + Relation right_input = 2; + + // (Required) The Set operation type. + SetOpType set_op_type = 3; + + // (Optional) If to remove duplicate rows. + // + // True to preserve all results. + // False to remove duplicate rows. + optional bool is_all = 4; + + // (Optional) If to perform the Set operation based on name resolution. + // + // Only UNION supports this option. + optional bool by_name = 5; + + // (Optional) If to perform the Set operation and allow missing columns. + // + // Only UNION supports this option. + optional bool allow_missing_columns = 6; + + enum SetOpType { + SET_OP_TYPE_UNSPECIFIED = 0; + SET_OP_TYPE_INTERSECT = 1; + SET_OP_TYPE_UNION = 2; + SET_OP_TYPE_EXCEPT = 3; + } +} + +// Relation of type [[Limit]] that is used to `limit` rows from the input relation. +message Limit { + // (Required) Input relation for a Limit. + Relation input = 1; + + // (Required) the limit. + int32 limit = 2; +} + +// Relation of type [[Offset]] that is used to read rows staring from the `offset` on +// the input relation. +message Offset { + // (Required) Input relation for an Offset. + Relation input = 1; + + // (Required) the limit. + int32 offset = 2; +} + +// Relation of type [[Tail]] that is used to fetch `limit` rows from the last of the input relation. +message Tail { + // (Required) Input relation for an Tail. + Relation input = 1; + + // (Required) the limit. + int32 limit = 2; +} + +// Relation of type [[Aggregate]]. +message Aggregate { + // (Required) Input relation for a RelationalGroupedDataset. + Relation input = 1; + + // (Required) How the RelationalGroupedDataset was built. + GroupType group_type = 2; + + // (Required) Expressions for grouping keys + repeated Expression grouping_expressions = 3; + + // (Required) List of values that will be translated to columns in the output DataFrame. + repeated Expression aggregate_expressions = 4; + + // (Optional) Pivots a column of the current `DataFrame` and performs the specified aggregation. + Pivot pivot = 5; + + enum GroupType { + GROUP_TYPE_UNSPECIFIED = 0; + GROUP_TYPE_GROUPBY = 1; + GROUP_TYPE_ROLLUP = 2; + GROUP_TYPE_CUBE = 3; + GROUP_TYPE_PIVOT = 4; + } + + message Pivot { + // (Required) The column to pivot + Expression col = 1; + + // (Optional) List of values that will be translated to columns in the output DataFrame. + // + // Note that if it is empty, the server side will immediately trigger a job to collect + // the distinct values of the column. + repeated Expression.Literal values = 2; + } +} + +// Relation of type [[Sort]]. +message Sort { + // (Required) Input relation for a Sort. 
+ Relation input = 1; + + // (Required) The ordering expressions + repeated Expression.SortOrder order = 2; + + // (Optional) if this is a global sort. + optional bool is_global = 3; +} + + +// Drop specified columns. +message Drop { + // (Required) The input relation. + Relation input = 1; + + // (Optional) columns to drop. + repeated Expression columns = 2; + + // (Optional) names of columns to drop. + repeated string column_names = 3; +} + + +// Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only +// the subset of columns or all the columns. +message Deduplicate { + // (Required) Input relation for a Deduplicate. + Relation input = 1; + + // (Optional) Deduplicate based on a list of column names. + // + // This field does not co-use with `all_columns_as_keys`. + repeated string column_names = 2; + + // (Optional) Deduplicate based on all the columns of the input relation. + // + // This field does not co-use with `column_names`. + optional bool all_columns_as_keys = 3; + + // (Optional) Deduplicate within the time range of watermark. + optional bool within_watermark = 4; +} + +// A relation that does not need to be qualified by name. +message LocalRelation { + // (Optional) Local collection data serialized into Arrow IPC streaming format which contains + // the schema of the data. + optional bytes data = 1; + + // (Optional) The schema of local data. + // It should be either a DDL-formatted type string or a JSON string. + // + // The server side will update the column names and data types according to this schema. + // If the 'data' is not provided, then this schema will be required. + optional string schema = 2; +} + +// A local relation that has been cached already. +message CachedLocalRelation { + // `userId` and `sessionId` fields are deleted since the server must always use the active + // session/user rather than arbitrary values provided by the client. It is never valid to access + // a local relation from a different session/user. + reserved 1, 2; + reserved "userId", "sessionId"; + + // (Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation. + string hash = 3; +} + +// Represents a remote relation that has been cached on server. +message CachedRemoteRelation { + // (Required) ID of the remote related (assigned by the service). + string relation_id = 1; +} + +// Relation of type [[Sample]] that samples a fraction of the dataset. +message Sample { + // (Required) Input relation for a Sample. + Relation input = 1; + + // (Required) lower bound. + double lower_bound = 2; + + // (Required) upper bound. + double upper_bound = 3; + + // (Optional) Whether to sample with replacement. + optional bool with_replacement = 4; + + // (Optional) The random seed. + optional int64 seed = 5; + + // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. + // This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the + // provided weights. Otherwise, it is false. + bool deterministic_order = 6; +} + +// Relation of type [[Range]] that generates a sequence of integers. +message Range { + // (Optional) Default value = 0 + optional int64 start = 1; + + // (Required) + int64 end = 2; + + // (Required) + int64 step = 3; + + // Optional. Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if + // it is set, or 2) spark default parallelism. + optional int32 num_partitions = 4; +} + +// Relation alias. 
+message SubqueryAlias { + // (Required) The input relation of SubqueryAlias. + Relation input = 1; + + // (Required) The alias. + string alias = 2; + + // (Optional) Qualifier of the alias. + repeated string qualifier = 3; +} + +// Relation repartition. +message Repartition { + // (Required) The input relation of Repartition. + Relation input = 1; + + // (Required) Must be positive. + int32 num_partitions = 2; + + // (Optional) Default value is false. + optional bool shuffle = 3; +} + +// Compose the string representing rows for output. +// It will invoke 'Dataset.showString' to compute the results. +message ShowString { + // (Required) The input relation. + Relation input = 1; + + // (Required) Number of rows to show. + int32 num_rows = 2; + + // (Required) If set to more than 0, truncates strings to + // `truncate` characters and all cells will be aligned right. + int32 truncate = 3; + + // (Required) If set to true, prints output rows vertically (one line per column value). + bool vertical = 4; +} + +// Compose the string representing rows for output. +// It will invoke 'Dataset.htmlString' to compute the results. +message HtmlString { + // (Required) The input relation. + Relation input = 1; + + // (Required) Number of rows to show. + int32 num_rows = 2; + + // (Required) If set to more than 0, truncates strings to + // `truncate` characters and all cells will be aligned right. + int32 truncate = 3; +} + +// Computes specified statistics for numeric and string columns. +// It will invoke 'Dataset.summary' (same as 'StatFunctions.summary') +// to compute the results. +message StatSummary { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Statistics from to be computed. + // + // Available statistics are: + // count + // mean + // stddev + // min + // max + // arbitrary approximate percentiles specified as a percentage (e.g. 75%) + // count_distinct + // approx_count_distinct + // + // If no statistics are given, this function computes 'count', 'mean', 'stddev', 'min', + // 'approximate quartiles' (percentiles at 25%, 50%, and 75%), and 'max'. + repeated string statistics = 2; +} + +// Computes basic statistics for numeric and string columns, including count, mean, stddev, min, +// and max. If no columns are given, this function computes statistics for all numerical or +// string columns. +message StatDescribe { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Columns to compute statistics on. + repeated string cols = 2; +} + +// Computes a pair-wise frequency table of the given columns. Also known as a contingency table. +// It will invoke 'Dataset.stat.crosstab' (same as 'StatFunctions.crossTabulate') +// to compute the results. +message StatCrosstab { + // (Required) The input relation. + Relation input = 1; + + // (Required) The name of the first column. + // + // Distinct items will make the first item of each row. + string col1 = 2; + + // (Required) The name of the second column. + // + // Distinct items will make the column names of the DataFrame. + string col2 = 3; +} + +// Calculate the sample covariance of two numerical columns of a DataFrame. +// It will invoke 'Dataset.stat.cov' (same as 'StatFunctions.calculateCov') to compute the results. +message StatCov { + // (Required) The input relation. + Relation input = 1; + + // (Required) The name of the first column. + string col1 = 2; + + // (Required) The name of the second column. + string col2 = 3; +} + +// Calculates the correlation of two columns of a DataFrame. 
Currently only supports the Pearson +// Correlation Coefficient. It will invoke 'Dataset.stat.corr' (same as +// 'StatFunctions.pearsonCorrelation') to compute the results. +message StatCorr { + // (Required) The input relation. + Relation input = 1; + + // (Required) The name of the first column. + string col1 = 2; + + // (Required) The name of the second column. + string col2 = 3; + + // (Optional) Default value is 'pearson'. + // + // Currently only supports the Pearson Correlation Coefficient. + optional string method = 4; +} + +// Calculates the approximate quantiles of numerical columns of a DataFrame. +// It will invoke 'Dataset.stat.approxQuantile' (same as 'StatFunctions.approxQuantile') +// to compute the results. +message StatApproxQuantile { + // (Required) The input relation. + Relation input = 1; + + // (Required) The names of the numerical columns. + repeated string cols = 2; + + // (Required) A list of quantile probabilities. + // + // Each number must belong to [0, 1]. + // For example 0 is the minimum, 0.5 is the median, 1 is the maximum. + repeated double probabilities = 3; + + // (Required) The relative target precision to achieve (greater than or equal to 0). + // + // If set to zero, the exact quantiles are computed, which could be very expensive. + // Note that values greater than 1 are accepted but give the same result as 1. + double relative_error = 4; +} + +// Finding frequent items for columns, possibly with false positives. +// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems') +// to compute the results. +message StatFreqItems { + // (Required) The input relation. + Relation input = 1; + + // (Required) The names of the columns to search frequent items in. + repeated string cols = 2; + + // (Optional) The minimum frequency for an item to be considered `frequent`. + // Should be greater than 1e-4. + optional double support = 3; +} + + +// Returns a stratified sample without replacement based on the fraction +// given on each stratum. +// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems') +// to compute the results. +message StatSampleBy { + // (Required) The input relation. + Relation input = 1; + + // (Required) The column that defines strata. + Expression col = 2; + + // (Required) Sampling fraction for each stratum. + // + // If a stratum is not specified, we treat its fraction as zero. + repeated Fraction fractions = 3; + + // (Optional) The random seed. + optional int64 seed = 5; + + message Fraction { + // (Required) The stratum. + Expression.Literal stratum = 1; + + // (Required) The fraction value. Must be in [0, 1]. + double fraction = 2; + } +} + + +// Replaces null values. +// It will invoke 'Dataset.na.fill' (same as 'DataFrameNaFunctions.fill') to compute the results. +// Following 3 parameter combinations are supported: +// 1, 'values' only contains 1 item, 'cols' is empty: +// replaces null values in all type-compatible columns. +// 2, 'values' only contains 1 item, 'cols' is not empty: +// replaces null values in specified columns. +// 3, 'values' contains more than 1 items, then 'cols' is required to have the same length: +// replaces each specified column with corresponding value. +message NAFill { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Optional list of column names to consider. + repeated string cols = 2; + + // (Required) Values to replace null values with. + // + // Should contain at least 1 item. 
+ // Only 4 data types are supported now: bool, long, double, string + repeated Expression.Literal values = 3; +} + + +// Drop rows containing null values. +// It will invoke 'Dataset.na.drop' (same as 'DataFrameNaFunctions.drop') to compute the results. +message NADrop { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Optional list of column names to consider. + // + // When it is empty, all the columns in the input relation will be considered. + repeated string cols = 2; + + // (Optional) The minimum number of non-null and non-NaN values required to keep. + // + // When not set, it is equivalent to the number of considered columns, which means + // a row will be kept only if all columns are non-null. + // + // 'how' options ('all', 'any') can be easily converted to this field: + // - 'all' -> set 'min_non_nulls' 1; + // - 'any' -> keep 'min_non_nulls' unset; + optional int32 min_non_nulls = 3; +} + + +// Replaces old values with the corresponding values. +// It will invoke 'Dataset.na.replace' (same as 'DataFrameNaFunctions.replace') +// to compute the results. +message NAReplace { + // (Required) The input relation. + Relation input = 1; + + // (Optional) List of column names to consider. + // + // When it is empty, all the type-compatible columns in the input relation will be considered. + repeated string cols = 2; + + // (Optional) The value replacement mapping. + repeated Replacement replacements = 3; + + message Replacement { + // (Required) The old value. + // + // Only 4 data types are supported now: null, bool, double, string. + Expression.Literal old_value = 1; + + // (Required) The new value. + // + // Should be of the same data type with the old value. + Expression.Literal new_value = 2; + } +} + + +// Rename columns on the input relation by the same length of names. +message ToDF { + // (Required) The input relation of RenameColumnsBySameLengthNames. + Relation input = 1; + + // (Required) + // + // The number of columns of the input relation must be equal to the length + // of this field. If this is not true, an exception will be returned. + repeated string column_names = 2; +} + + +// Rename columns on the input relation by a map with name to name mapping. +message WithColumnsRenamed { + // (Required) The input relation. + Relation input = 1; + + + // (Required) + // + // Renaming column names of input relation from A to B where A is the map key + // and B is the map value. This is a no-op if schema doesn't contain any A. It + // does not require that all input relation column names to present as keys. + // duplicated B are not allowed. + map rename_columns_map = 2; +} + +// Adding columns or replacing the existing columns that have the same names. +message WithColumns { + // (Required) The input relation. + Relation input = 1; + + // (Required) + // + // Given a column name, apply the corresponding expression on the column. If column + // name exists in the input relation, then replace the column. If the column name + // does not exist in the input relation, then adds it as a new column. + // + // Only one name part is expected from each Expression.Alias. + // + // An exception is thrown when duplicated names are present in the mapping. + repeated Expression.Alias aliases = 2; +} + +message WithWatermark { + + // (Required) The input relation + Relation input = 1; + + // (Required) Name of the column containing event time. + string event_time = 2; + + // (Required) + string delay_threshold = 3; +} + +// Specify a hint over a relation. 
Hint should have a name and optional parameters. +message Hint { + // (Required) The input relation. + Relation input = 1; + + // (Required) Hint name. + // + // Supported Join hints include BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL. + // + // Supported partitioning hints include COALESCE, REPARTITION, REPARTITION_BY_RANGE. + string name = 2; + + // (Optional) Hint parameters. + repeated Expression parameters = 3; +} + +// Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set. +message Unpivot { + // (Required) The input relation. + Relation input = 1; + + // (Required) Id columns. + repeated Expression ids = 2; + + // (Optional) Value columns to unpivot. + optional Values values = 3; + + // (Required) Name of the variable column. + string variable_column_name = 4; + + // (Required) Name of the value column. + string value_column_name = 5; + + message Values { + repeated Expression values = 1; + } +} + +message ToSchema { + // (Required) The input relation. + Relation input = 1; + + // (Required) The user provided schema. + // + // The Sever side will update the dataframe with this schema. + DataType schema = 2; +} + +message RepartitionByExpression { + // (Required) The input relation. + Relation input = 1; + + // (Required) The partitioning expressions. + repeated Expression partition_exprs = 2; + + // (Optional) number of partitions, must be positive. + optional int32 num_partitions = 3; +} + +message MapPartitions { + // (Required) Input relation for a mapPartitions-equivalent API: mapInPandas, mapInArrow. + Relation input = 1; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 2; + + // (Optional) Whether to use barrier mode execution or not. + optional bool is_barrier = 3; +} + +message GroupMap { + // (Required) Input relation for Group Map API: apply, applyInPandas. + Relation input = 1; + + // (Required) Expressions for grouping keys. + repeated Expression grouping_expressions = 2; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 3; + + // (Optional) Expressions for sorting. Only used by Scala Sorted Group Map API. + repeated Expression sorting_expressions = 4; + + // Below fields are only used by (Flat)MapGroupsWithState + // (Optional) Input relation for initial State. + Relation initial_input = 5; + + // (Optional) Expressions for grouping keys of the initial state input relation. + repeated Expression initial_grouping_expressions = 6; + + // (Optional) True if MapGroupsWithState, false if FlatMapGroupsWithState. + optional bool is_map_groups_with_state = 7; + + // (Optional) The output mode of the function. + optional string output_mode = 8; + + // (Optional) Timeout configuration for groups that do not receive data for a while. + optional string timeout_conf = 9; +} + +message CoGroupMap { + // (Required) One input relation for CoGroup Map API - applyInPandas. + Relation input = 1; + + // Expressions for grouping keys of the first input relation. + repeated Expression input_grouping_expressions = 2; + + // (Required) The other input relation. + Relation other = 3; + + // Expressions for grouping keys of the other input relation. + repeated Expression other_grouping_expressions = 4; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 5; + + // (Optional) Expressions for sorting. Only used by Scala Sorted CoGroup Map API. + repeated Expression input_sorting_expressions = 6; + + // (Optional) Expressions for sorting. 
Only used by Scala Sorted CoGroup Map API. + repeated Expression other_sorting_expressions = 7; +} + +message ApplyInPandasWithState { + // (Required) Input relation for applyInPandasWithState. + Relation input = 1; + + // (Required) Expressions for grouping keys. + repeated Expression grouping_expressions = 2; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 3; + + // (Required) Schema for the output DataFrame. + string output_schema = 4; + + // (Required) Schema for the state. + string state_schema = 5; + + // (Required) The output mode of the function. + string output_mode = 6; + + // (Required) Timeout configuration for groups that do not receive data for a while. + string timeout_conf = 7; +} + +message CommonInlineUserDefinedTableFunction { + // (Required) Name of the user-defined table function. + string function_name = 1; + + // (Optional) Whether the user-defined table function is deterministic. + bool deterministic = 2; + + // (Optional) Function input arguments. Empty arguments are allowed. + repeated Expression arguments = 3; + + // (Required) Type of the user-defined table function. + oneof function { + PythonUDTF python_udtf = 4; + } +} + +message PythonUDTF { + // (Optional) Return type of the Python UDTF. + optional DataType return_type = 1; + + // (Required) EvalType of the Python UDTF. + int32 eval_type = 2; + + // (Required) The encoded commands of the Python UDTF. + bytes command = 3; + + // (Required) Python version being used in the client. + string python_ver = 4; +} + +// Collect arbitrary (named) metrics from a dataset. +message CollectMetrics { + // (Required) The input relation. + Relation input = 1; + + // (Required) Name of the metrics. + string name = 2; + + // (Required) The metric sequence. + repeated Expression metrics = 3; +} + +message Parse { + // (Required) Input relation to Parse. The input is expected to have single text column. + Relation input = 1; + // (Required) The expected format of the text. + ParseFormat format = 2; + + // (Optional) DataType representing the schema. If not set, Spark will infer the schema. + optional DataType schema = 3; + + // Options for the csv/json parser. The map key is case insensitive. + map options = 4; + enum ParseFormat { + PARSE_FORMAT_UNSPECIFIED = 0; + PARSE_FORMAT_CSV = 1; + PARSE_FORMAT_JSON = 2; + } +} diff --git a/velox/functions/sparksql/fuzzer/proto/spark/connect/types.proto b/velox/functions/sparksql/fuzzer/proto/spark/connect/types.proto new file mode 100644 index 0000000000000..43552381d28f8 --- /dev/null +++ b/velox/functions/sparksql/fuzzer/proto/spark/connect/types.proto @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = 'proto3'; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; +option go_package = "internal/generated"; + +// This message describes the logical [[DataType]] of something. It does not carry the value +// itself but only describes it. +message DataType { + oneof kind { + NULL null = 1; + + Binary binary = 2; + + Boolean boolean = 3; + + // Numeric types + Byte byte = 4; + Short short = 5; + Integer integer = 6; + Long long = 7; + + Float float = 8; + Double double = 9; + Decimal decimal = 10; + + // String types + String string = 11; + Char char = 12; + VarChar var_char = 13; + + // Datatime types + Date date = 14; + Timestamp timestamp = 15; + TimestampNTZ timestamp_ntz = 16; + + // Interval types + CalendarInterval calendar_interval = 17; + YearMonthInterval year_month_interval = 18; + DayTimeInterval day_time_interval = 19; + + // Complex types + Array array = 20; + Struct struct = 21; + Map map = 22; + + // UserDefinedType + UDT udt = 23; + + // UnparsedDataType + Unparsed unparsed = 24; + } + + message Boolean { + uint32 type_variation_reference = 1; + } + + message Byte { + uint32 type_variation_reference = 1; + } + + message Short { + uint32 type_variation_reference = 1; + } + + message Integer { + uint32 type_variation_reference = 1; + } + + message Long { + uint32 type_variation_reference = 1; + } + + message Float { + uint32 type_variation_reference = 1; + } + + message Double { + uint32 type_variation_reference = 1; + } + + message String { + uint32 type_variation_reference = 1; + } + + message Binary { + uint32 type_variation_reference = 1; + } + + message NULL { + uint32 type_variation_reference = 1; + } + + message Timestamp { + uint32 type_variation_reference = 1; + } + + message Date { + uint32 type_variation_reference = 1; + } + + message TimestampNTZ { + uint32 type_variation_reference = 1; + } + + message CalendarInterval { + uint32 type_variation_reference = 1; + } + + message YearMonthInterval { + optional int32 start_field = 1; + optional int32 end_field = 2; + uint32 type_variation_reference = 3; + } + + message DayTimeInterval { + optional int32 start_field = 1; + optional int32 end_field = 2; + uint32 type_variation_reference = 3; + } + + // Start compound types. + message Char { + int32 length = 1; + uint32 type_variation_reference = 2; + } + + message VarChar { + int32 length = 1; + uint32 type_variation_reference = 2; + } + + message Decimal { + optional int32 scale = 1; + optional int32 precision = 2; + uint32 type_variation_reference = 3; + } + + message StructField { + string name = 1; + DataType data_type = 2; + bool nullable = 3; + optional string metadata = 4; + } + + message Struct { + repeated StructField fields = 1; + uint32 type_variation_reference = 2; + } + + message Array { + DataType element_type = 1; + bool contains_null = 2; + uint32 type_variation_reference = 3; + } + + message Map { + DataType key_type = 1; + DataType value_type = 2; + bool value_contains_null = 3; + uint32 type_variation_reference = 4; + } + + message UDT { + string type = 1; + optional string jvm_class = 2; + optional string python_class = 3; + optional string serialized_python_class = 4; + DataType sql_type = 5; + } + + message Unparsed { + // (Required) The unparsed data type string + string data_type_string = 1; + } +}
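
The .proto files above are verbatim copies of Apache Spark's Spark Connect protocol definitions, placed under the sparksql fuzzer's proto directory. To give a feel for how the messages compose, the sketch below builds a small Connect plan (a Project over a Filter over a SQL relation) using the C++ classes that protoc would generate from relations.proto and its imports. This is a minimal sketch, not part of the patch: the include path, the standalone main(), and the build wiring against libprotobuf are assumptions for illustration only.

// Hypothetical standalone example; assumes the protos above are compiled with
// protoc's C++ generator and linked against libprotobuf, and that the
// generated headers keep the spark/connect/ directory layout.
#include <iostream>
#include <string>

#include "spark/connect/relations.pb.h"  // transitively pulls in expressions.pb.h

int main() {
  using spark::connect::Relation;

  // Plan: Project([*]) <- Filter(id > 5) <- SQL("SELECT id FROM range(10)").
  Relation plan;
  plan.mutable_common()->set_plan_id(1);

  auto* project = plan.mutable_project();
  // A Project requires at least one expression; '*' expands all input fields.
  project->add_expressions()->mutable_unresolved_star();

  auto* filter = project->mutable_input()->mutable_filter();
  filter->mutable_input()->mutable_sql()->set_query("SELECT id FROM range(10)");

  // Condition: unresolved function ">"(id, 5); name resolution happens on the
  // server side, following the UnresolvedFunction / UnresolvedAttribute contract.
  auto* gt = filter->mutable_condition()->mutable_unresolved_function();
  gt->set_function_name(">");
  gt->add_arguments()->mutable_unresolved_attribute()->set_unparsed_identifier("id");
  gt->add_arguments()->mutable_literal()->set_integer(5);

  // SerializeAsString() yields the wire bytes a Connect client would send;
  // DebugString() prints a readable text form of the same message.
  const std::string wire = plan.SerializeAsString();
  std::cout << "serialized plan: " << wire.size() << " bytes\n"
            << plan.DebugString();
  return 0;
}

Because rel_type and expr_type are oneofs, setting one member (for example via mutable_project()) clears any previously set alternative, which matches the "exactly one explicit relation type" contract stated in the Relation comment.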