diff --git a/.circleci/config.yml b/.circleci/config.yml index 2391d810f88d..e1b67e1502f2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,6 +19,7 @@ workflows: dist-compile: jobs: - linux-build + - linux-pr-fuzzer-run - linux-build-options - linux-adapters - macos-build: @@ -161,7 +162,7 @@ commands: - run: name: "Run << parameters.fuzzer_name >> Fuzzer" command: | - eval " << parameters.fuzzer_exe >> << parameters.fuzzer_args >> " \ + eval ' << parameters.fuzzer_exe >> << parameters.fuzzer_args >> ' \ 2>&1 | tee "<< parameters.fuzzer_output >>" || ( \ tail -n 1000 "<< parameters.fuzzer_output >>" ; \ echo "FAIL: << parameters.fuzzer_name >> run failed"; \ @@ -357,34 +358,6 @@ jobs: name: "Run Example Binaries" command: | find _build/debug/velox/examples/ -maxdepth 1 -type f -executable -exec "{}" \; - - run: - name: "Build and Test PyVelox" - command: | - conda init bash - source ~/.bashrc - conda create -y --name pyveloxenv python=3.7 - conda activate pyveloxenv - LD_LIBRARY_PATH=/usr/local/lib make python-test - - run: - name: "Check function signatures" - command: | - source ~/.bashrc - conda activate pyveloxenv - pip install deepdiff - python ./scripts/signature.py export --spark --presto /tmp/pr_signatures.json - cp ./scripts/signature.py /tmp/signature.py - git remote add upstream https://github.com/facebookincubator/velox - git fetch upstream - merge_base=$(git merge-base 'upstream/main' `git rev-parse HEAD`) || \ - { echo "::error::Failed to find merge_base"; exit 1; } - echo "Merge Base: $merge_base" - git checkout $merge_base - git submodule update --init --recursive - LD_LIBRARY_PATH=/usr/local/lib make python-clean - LD_LIBRARY_PATH=/usr/local/lib make python-build - cp /tmp/signature.py ./scripts/signature.py - python ./scripts/signature.py export --spark --presto /tmp/main_signatures.json - python ./scripts/signature.py diff /tmp/main_signatures.json /tmp/pr_signatures.json - post-steps linux-build-release: @@ -671,3 +644,78 @@ jobs: git config --global user.name "velox" cd presto/presto-native-execution make runtime-container + + linux-pr-fuzzer-run: + executor: build + steps: + - pre-steps + - run: + name: "Get merge base function signatures" + command: | + source ~/.bashrc + conda create -y --name pyveloxenv python=3.7 + conda activate pyveloxenv + cp ./scripts/signature.py /tmp/signature.py + pip install deepdiff + git remote add upstream https://github.com/facebookincubator/velox + git fetch upstream + merge_base=$(git merge-base 'upstream/main' `git rev-parse HEAD`) || \ + { echo "::error::Failed to find merge_base"; exit 1; } + echo "Merge Base: $merge_base" + git checkout $merge_base + git submodule update --init --recursive + LD_LIBRARY_PATH=/usr/local/lib make python-clean + LD_LIBRARY_PATH=/usr/local/lib make python-build + python /tmp/signature.py export --spark spark_merge_base_signatures.json + python /tmp/signature.py export --presto presto_merge_base_signatures.json + - checkout + - run: + name: "Build" + command: | + make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=6 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" + ccache -s + no_output_timeout: 1h + - run: + name: "Build and test PyVelox" + command: | + conda init bash + source ~/.bashrc + conda activate pyveloxenv + LD_LIBRARY_PATH=/usr/local/lib make python-test + - run: + name: "Check and create bias function signatures" + command: | + source ~/.bashrc + conda activate pyveloxenv + pip install deepdiff + python ./scripts/signature.py export --presto presto_pr_signatures.json + 
python ./scripts/signature.py export --spark spark_pr_signatures.json + python ./scripts/signature.py bias presto_merge_base_signatures.json presto_pr_signatures.json /tmp/presto_bias_functions + python ./scripts/signature.py bias spark_merge_base_signatures.json spark_pr_signatures.json /tmp/spark_bias_functions + + - fuzzer-run: + fuzzer_output: "/tmp/fuzzer.log" + fuzzer_repro: "/tmp/fuzzer_repro" + fuzzer_name: "Expression Bias Run" + fuzzer_exe: "if [ -f /tmp/presto_bias_functions ]; then _build/debug/velox/expression/tests/velox_expression_fuzzer_test" + fuzzer_args: " --seed ${RANDOM} --lazy_vector_generation_ratio 0.2 \ + --assign_function_tickets $(cat /tmp/presto_bias_functions) \ + --duration_sec 3600 --enable_variadic_signatures \ + --velox_fuzzer_enable_complex_types \ + --velox_fuzzer_enable_column_reuse \ + --velox_fuzzer_enable_expression_reuse \ + --max_expression_trees_per_step 2 \ + --retry_with_try \ + --enable_dereference \ + --logtostderr=1 --minloglevel=0 \ + --repro_persist_path=/tmp/fuzzer_repro ; fi" + + - fuzzer-run: + fuzzer_output: "/tmp/spark_fuzzer.log" + fuzzer_repro: "/tmp/spark_fuzzer_repro" + fuzzer_name: "Spark Bias Run" + fuzzer_exe: "if [ -f /tmp/spark_bias_functions ]; then _build/debug/velox/expression/tests/spark_expression_fuzzer_test" + fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0 \ + --assign_function_tickets $(cat /tmp/spark_bias_functions) \ + --repro_persist_path=/tmp/spark_fuzzer_repro ; fi" + diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c1ddd372baef..6a140b12c1b3 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -193,8 +193,7 @@ jobs: ./scripts/benchmark-runner.py compare \ --baseline_path ${BASELINE_OUTPUT_PATH} \ --contender_path ${CONTENDER_OUTPUT_PATH} \ - --recursive \ - --do_not_fail + --recursive echo "::endgroup::" - name: "Save PR number" diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index fc9daf778045..8090f8779ccc 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit dfeb9e3b20b41ba776d2789e035c1b36c96faa75 +Subproject commit d0254f0af28be32985a43159c3dd8156892f140c diff --git a/build/fbcode_builder/CMake/FBPythonBinary.cmake b/build/fbcode_builder/CMake/FBPythonBinary.cmake index 99c33fb8c953..f91ebaf32645 100644 --- a/build/fbcode_builder/CMake/FBPythonBinary.cmake +++ b/build/fbcode_builder/CMake/FBPythonBinary.cmake @@ -32,7 +32,7 @@ if(NOT TARGET Python3::Interpreter) # We find with QUIET here, since otherwise this generates some noisy warnings # on versions of CMake before 3.12 if (WIN32) - # On Windows we need both the Intepreter as well as the Development + # On Windows we need both the Interpreter as well as the Development # libraries. find_package(Python3 COMPONENTS Interpreter Development QUIET) else() @@ -487,7 +487,7 @@ function(add_fb_python_library LIB_NAME) # won't complain if one of the dependencies doesn't exist (since it is # intended to allow passing in file names for plain library files rather # than just targets). - # - It ensures that sources for our depencencies are built before any + # - It ensures that sources for our dependencies are built before any # executable that depends on us. 
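A condensed sketch of the bias flow the job above wires together, using the same commands and paths that appear in the config:

```sh
# Export signatures for the PR head, diff them against the merge base, and
# write the changed function names (with ticket counts) to the bias file.
python ./scripts/signature.py export --presto presto_pr_signatures.json
python ./scripts/signature.py bias presto_merge_base_signatures.json \
    presto_pr_signatures.json /tmp/presto_bias_functions
# The fuzzer then weights those functions via --assign_function_tickets.
_build/debug/velox/expression/tests/velox_expression_fuzzer_test \
    --assign_function_tickets "$(cat /tmp/presto_bias_functions)" \
    --duration_sec 3600
```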
Note that we depend on "${dep}.py_lib" # rather than "${dep}.py_sources_built" for this purpose because the # ".py_sources_built" target won't be available for imported targets. diff --git a/build/fbcode_builder/CMake/fb_py_test_main.py b/build/fbcode_builder/CMake/fb_py_test_main.py index e9ae5dd028a6..41626181b1ec 100644 --- a/build/fbcode_builder/CMake/fb_py_test_main.py +++ b/build/fbcode_builder/CMake/fb_py_test_main.py @@ -262,7 +262,7 @@ def stopTest(self, test): super(BuckTestResult, self).stopTest(test) - # If a failure occured during module/class setup, then this "test" may + # If a failure occurred during module/class setup, then this "test" may # actually be a `_ErrorHolder`, which doesn't contain explicit info # about the upcoming test. Since we really only care about the test # name field (i.e. `_testMethodName`), we use that to detect an actual diff --git a/build/fbcode_builder/getdeps.py b/build/fbcode_builder/getdeps.py index 565ef99135e7..9358c425e4aa 100755 --- a/build/fbcode_builder/getdeps.py +++ b/build/fbcode_builder/getdeps.py @@ -626,7 +626,7 @@ def run_project_cmd(self, args, loader, manifest): ) builder.build(install_dirs, reconfigure=reconfigure) - # If we are building the project (not depdendency) and a specific + # If we are building the project (not dependency) and a specific # cmake_target (not 'install') has been requested, then we don't # set the built_marker. This allows subsequent runs of getdeps.py # for the project to run with different cmake_targets to trigger diff --git a/build/fbcode_builder/getdeps/builder.py b/build/fbcode_builder/getdeps/builder.py index 4f0c809092f2..aa1b0f99601c 100644 --- a/build/fbcode_builder/getdeps/builder.py +++ b/build/fbcode_builder/getdeps/builder.py @@ -346,7 +346,7 @@ def _build(self, install_dirs, reconfigure) -> None: class Iproute2Builder(BuilderBase): # ./configure --prefix does not work for iproute2. - # Thus, explicitly copy sources from src_dir to build_dir, bulid, + # Thus, explicitly copy sources from src_dir to build_dir, build, # and then install to inst_dir using DESTDIR # lastly, also copy include from build_dir to inst_dir def __init__(self, build_opts, ctx, manifest, src_dir, build_dir, inst_dir) -> None: diff --git a/build/fbcode_builder/getdeps/cargo.py b/build/fbcode_builder/getdeps/cargo.py index 64a4e577b33e..09e00a39cf98 100644 --- a/build/fbcode_builder/getdeps/cargo.py +++ b/build/fbcode_builder/getdeps/cargo.py @@ -194,7 +194,7 @@ def _patchup_workspace(self, dep_to_git) -> None: my-rename-of-crate = { package = "crate", git = "..." } they can count themselves lucky because the code will raise an - Exception. There migh be more cases where the code will silently pass + Exception. There might be more cases where the code will silently pass producing bad results. """ workspace_dir = self.workspace_dir() @@ -362,7 +362,7 @@ def _resolve_dep_to_crates(self, build_source_dir, dep_to_git): dep_to_crates = {} - # First populate explicit crate paths from depedencies + # First populate explicit crate paths from dependencies for name, git_conf in dep_to_git.items(): crates = git_conf["crate_source_map"].keys() if crates: diff --git a/build/fbcode_builder/getdeps/envfuncs.py b/build/fbcode_builder/getdeps/envfuncs.py index 6072a69ec4db..60de6b23143e 100644 --- a/build/fbcode_builder/getdeps/envfuncs.py +++ b/build/fbcode_builder/getdeps/envfuncs.py @@ -32,7 +32,7 @@ def _key(self, key): # project uses `unicode_literals`. 
`subprocess` will raise an error # if the environment that it is passed has a mixture of byte and # unicode strings. - # It is simplest to force everthing to be `str` for the sake of + # It is simplest to force everything to be `str` for the sake of # consistency. key = str(key) if sys.platform.startswith("win"): diff --git a/build/fbcode_builder/manifests/lz4 b/build/fbcode_builder/manifests/lz4 index 2ce1ca9fd1ec..084d6a4aecd8 100644 --- a/build/fbcode_builder/manifests/lz4 +++ b/build/fbcode_builder/manifests/lz4 @@ -6,8 +6,8 @@ lz4 [rpms] lz4-devel -# centos (not centos_stream that is Meta internal) 8 is missing this -[rpms.not(all(distro=centos,distro_vers=8))] +# centos 8 and centos_stream 9 are missing this rpm +[rpms.not(any(all(distro=centos,distro_vers=8),all(distro=centos_stream,distro_vers=9)))] lz4-static [debs] diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh index a5b1ccf794fb..83d8990603ea 100755 --- a/scripts/setup-macos.sh +++ b/scripts/setup-macos.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This script documents setting up a macOS host for presto_cpp +# This script documents setting up a macOS host for Velox # development. Running it should make you ready to compile. # # Environment variables: diff --git a/scripts/signature.py b/scripts/signature.py index 32d2bf361233..95a51b11371b 100644 --- a/scripts/signature.py +++ b/scripts/signature.py @@ -34,10 +34,10 @@ class bcolors: def export(args): """Exports Velox function signatures.""" if args.spark: - pv.register_spark_signatures("spark_") + pv.register_spark_signatures() if args.presto: - pv.register_presto_signatures("presto_") + pv.register_presto_signatures() signatures = pv.get_function_signatures() @@ -51,12 +51,15 @@ def export(args): return 0 -def diff(args): - """Diffs Velox function signatures.""" - first_signatures = json.load(args.first) - second_signatures = json.load(args.second) +def diff_signatures(base_signatures, contender_signatures): + """Diffs Velox function signatures. Returns a tuple of the delta diff and exit status.""" + delta = DeepDiff( - first_signatures, second_signatures, ignore_order=True, report_repetition=True + base_signatures, + contender_signatures, + ignore_order=True, + report_repetition=True, + view="tree", ) exit_status = 0 if delta: @@ -93,10 +96,69 @@ def diff(args): """ ) - return exit_status + return delta, exit_status + + +def diff(args): + """Diffs Velox function signatures.""" + base_signatures = json.load(args.base) + contender_signatures = json.load(args.contender) + return diff_signatures(base_signatures, contender_signatures)[1] + + +def bias(args): + base_signatures = json.load(args.base) + contender_signatures = json.load(args.contender) + tickets = args.ticket_value + bias_output, status = bias_signatures( + base_signatures, contender_signatures, tickets + ) + if status: + return status + + if bias_output: + with open(args.output_path, "w") as f: + print(f"{bias_output}", file=f, end="") + + return 0 -def parse_args(): +def bias_signatures(base_signatures, contender_signatures, tickets): + """Returns newly added functions as a string and a status flag. + Newly added functions are biased like so `fn_name1=<tickets>,fn_name2=<tickets>`. + If it detects incompatible changes, it returns status 1 and an empty string. + """ + delta, status = diff_signatures(base_signatures, contender_signatures) + + # Return if the signature check call flags incompatible changes.
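+    # (With view="tree", each reported change knows its path in the input
+    # dict, and item.get_root_key() below recovers the top-level function
+    # name it belongs to. E.g. adding "foo" to {"reverse": [...]} yields an
+    # added item whose get_root_key() == "foo".)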
+ if status: + return "", status + + if not delta: + print(f"{bcolors.BOLD} No changes detected: Nothing to do!") + return "", 0 + + function_set = set() + for items in delta.values(): + for item in items: + function_set.add(item.get_root_key()) + + print(f"{bcolors.BOLD}Functions to be biased: {function_set}") + + if function_set: + return f"{f'={tickets},'.join(sorted(function_set)) + f'={tickets}'}", 0 + + return "", 0 + + +def get_tickets(val): + tickets = int(val) + if tickets < 0: + raise argparse.ArgumentTypeError("Can't have negative values!") + return tickets + + +def parse_args(args): global parser parser = argparse.ArgumentParser( @@ -111,16 +173,23 @@ def parse_args(): export_command_parser.add_argument("output_file", type=argparse.FileType("w")) diff_command_parser = command.add_parser("diff") - diff_command_parser.add_argument("first", type=argparse.FileType("r")) - diff_command_parser.add_argument("second", type=argparse.FileType("r")) - + diff_command_parser.add_argument("base", type=argparse.FileType("r")) + diff_command_parser.add_argument("contender", type=argparse.FileType("r")) + + bias_command_parser = command.add_parser("bias") + bias_command_parser.add_argument("base", type=argparse.FileType("r")) + bias_command_parser.add_argument("contender", type=argparse.FileType("r")) + bias_command_parser.add_argument("output_path") + bias_command_parser.add_argument( + "ticket_value", type=get_tickets, default=10, nargs="?" + ) parser.set_defaults(command="help") - return parser.parse_args() + return parser.parse_args(args) def main(): - args = parse_args() + args = parse_args(sys.argv[1:]) return globals()[args.command](args) diff --git a/scripts/tests/test_signature.py b/scripts/tests/test_signature.py new file mode 100644 index 000000000000..a32c1121c0b7 --- /dev/null +++ b/scripts/tests/test_signature.py @@ -0,0 +1,70 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from scripts.signature import bias_signatures +from pathlib import Path +import json + + +def read_from_file(file_path): + return Path(file_path).read_text() + + +def test_bias(base_signatures, contender_signatures): + return bias_signatures( + json.loads(base_signatures), json.loads(contender_signatures), 10 + ) + + +class SignatureTest(unittest.TestCase): + def test_bias(self): + # Remove a signature + _, return_value = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": []}""", + ) + + self.assertEqual(return_value, 1) + + # Add a new signature + bias_functions, _ = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)"], + "foo": ["(varchar) -> varchar"]}""", + ) + + self.assertEqual(bias_functions, "foo=10") + + # Modify a signature.
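+        # (Format reminder: bias_signatures() joins the sorted changed names
+        # as "<name>=<tickets>", so changing {"foo", "bar"} with 10 tickets
+        # yields "bar=10,foo=10", as the multi-change case below asserts.)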
+ bias_functions, _ = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)", "(varchar) -> varchar"]}""", + ) + + self.assertEqual(bias_functions, "reverse=10") + + # Add more than one signature change + bias_functions, _ = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)"], + "foo": ["(varchar) -> varchar"], + "bar": ["(varchar) -> varchar"]}""", + ) + + self.assertEqual(bias_functions, "bar=10,foo=10") + + +if __name__ == "__main__": + unittest.main() diff --git a/velox/benchmarks/basic/CastBenchmark.cpp b/velox/benchmarks/basic/CastBenchmark.cpp index 6908f12d1eef..356b2a704e57 100644 --- a/velox/benchmarks/basic/CastBenchmark.cpp +++ b/velox/benchmarks/basic/CastBenchmark.cpp @@ -48,10 +48,14 @@ int main(int argc, char** argv) { "cast_int", vectorMaker.rowVector( {"valid", "empty", "nan"}, {validInput, invalidInput, nanInput})) - .addExpression("try_invalid_empty_input", "try_cast (empty as int)") - .addExpression("try_invalid_nan", "try_cast (nan as int)") - .addExpression("try_valid", "try_cast (valid as int)") - .addExpression("valid", "cast(valid as int)") + .addExpression("try_cast_invalid_empty_input", "try_cast (empty as int) ") + .addExpression( + "tryexpr_cast_invalid_empty_input", "try (cast (empty as int))") + .addExpression("try_cast_invalid_nan", "try_cast (nan as int)") + .addExpression("tryexpr_cast_invalid_nan", "try (cast (nan as int))") + .addExpression("try_cast_valid", "try_cast (valid as int)") + .addExpression("tryexpr_cast_valid", "try (cast (valid as int))") + .addExpression("cast_valid", "cast(valid as int)") .withIterations(100) .disableTesting(); diff --git a/velox/benchmarks/tpch/TpchBenchmark.cpp b/velox/benchmarks/tpch/TpchBenchmark.cpp index 53ba99feb6b6..8890a50a8e1b 100644 --- a/velox/benchmarks/tpch/TpchBenchmark.cpp +++ b/velox/benchmarks/tpch/TpchBenchmark.cpp @@ -230,9 +230,10 @@ class TpchBenchmark { static_cast(FLAGS_ssd_checkpoint_interval_gb) << 30); } - auto allocator = std::make_shared(options); - allocator_ = std::make_shared( - allocator, memoryBytes, std::move(ssdCache)); + allocator_ = std::make_shared(options); + cache_ = + cache::AsyncDataCache::create(allocator_.get(), std::move(ssdCache)); + cache::AsyncDataCache::setInstance(cache_.get()); memory::MemoryAllocator::setDefaultInstance(allocator_.get()); } functions::prestosql::registerAllScalarFunctions(); @@ -261,6 +262,10 @@ class TpchBenchmark { connector::registerConnector(hiveConnector); } + void shutdown() { + cache_->prepareShutdown(); + } + std::pair, std::vector> run( const TpchPlan& tpchPlan) { int32_t repeat = 0; @@ -382,15 +387,13 @@ class TpchBenchmark { } #endif - auto cache = dynamic_cast(allocator_.get()); - if (cache) { - cache->clear(); + if (cache_) { + cache_->clear(); } } if (FLAGS_clear_ssd_cache) { - auto cache = dynamic_cast(allocator_.get()); - if (cache) { - auto ssdCache = cache->ssdCache(); + if (cache_) { + auto ssdCache = cache_->ssdCache(); if (ssdCache) { ssdCache->clear(); } @@ -462,7 +465,7 @@ class TpchBenchmark { std::unique_ptr ioExecutor_; std::unique_ptr cacheExecutor_; std::shared_ptr allocator_; - + std::shared_ptr cache_; // Parameter combinations to try. Each element specifies a flag and possible // values. All permutations are tried. 
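The TpchBenchmark changes above amount to the following cache lifecycle, shown as a minimal sketch (assumes 'options' and 'ssdCache' are prepared as in the diff):

```cpp
// Build the allocator, wrap it in an AsyncDataCache, and publish both.
auto allocator = std::make_shared<memory::MmapAllocator>(options);
auto cache = cache::AsyncDataCache::create(allocator.get(), std::move(ssdCache));
cache::AsyncDataCache::setInstance(cache.get());
memory::MemoryAllocator::setDefaultInstance(allocator.get());
// ... run queries ...
cache->prepareShutdown();  // release cache-held memory before destruction
```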
std::vector parameters_; @@ -578,6 +581,7 @@ int tpchBenchmarkMain() { } else { benchmark.runAllCombinations(); } + benchmark.shutdown(); queryBuilder.reset(); return 0; } diff --git a/velox/common/caching/AsyncDataCache.cpp b/velox/common/caching/AsyncDataCache.cpp index 92d54deebc8b..3e0ff2db0c7b 100644 --- a/velox/common/caching/AsyncDataCache.cpp +++ b/velox/common/caching/AsyncDataCache.cpp @@ -32,7 +32,7 @@ AsyncDataCacheEntry::AsyncDataCacheEntry(CacheShard* shard) : shard_(shard) { } AsyncDataCacheEntry::~AsyncDataCacheEntry() { - shard_->cache()->freeNonContiguous(data_); + shard_->cache()->allocator()->freeNonContiguous(data_); } void AsyncDataCacheEntry::setExclusiveToShared() { @@ -108,7 +108,7 @@ void AsyncDataCacheEntry::initialize(FileCacheKey key) { tinyData_.clear(); auto sizePages = bits::roundUp(size_, memory::AllocationTraits::kPageSize) / memory::AllocationTraits::kPageSize; - if (cache->allocateNonContiguous(sizePages, data_)) { + if (cache->allocator()->allocateNonContiguous(sizePages, data_)) { cache->incrementCachedPages(data().numPages()); } else { // No memory to cover 'this'. @@ -324,7 +324,7 @@ void CacheShard::removeEntryLocked(AsyncDataCacheEntry* entry) { auto numPages = entry->data().numPages(); if (numPages) { cache_->incrementCachedPages(-numPages); - cache_->freeNonContiguous(entry->data()); + cache_->allocator()->freeNonContiguous(entry->data()); } } } @@ -412,7 +412,7 @@ void CacheShard::evict(uint64_t bytesToFree, bool evictAllUnpinned) { void CacheShard::freeAllocations(std::vector& allocations) { for (auto& allocation : allocations) { - cache_->freeNonContiguous(allocation); + cache_->allocator()->freeNonContiguous(allocation); } allocations.clear(); } @@ -495,8 +495,7 @@ void CacheShard::appendSsdSaveable(std::vector& pins) { } AsyncDataCache::AsyncDataCache( - const std::shared_ptr& allocator, - uint64_t /* maxBytes */, + memory::MemoryAllocator* allocator, std::unique_ptr ssdCache) : allocator_(allocator), ssdCache_(std::move(ssdCache)), cachedPages_(0) { for (auto i = 0; i < kNumShards; ++i) { @@ -504,15 +503,44 @@ AsyncDataCache::AsyncDataCache( } } -AsyncDataCache::AsyncDataCache( - const std::shared_ptr& allocator, - std::unique_ptr ssdCache) - : allocator_(allocator), ssdCache_(std::move(ssdCache)), cachedPages_(0) { - for (auto i = 0; i < kNumShards; ++i) { - shards_.push_back(std::make_unique(this)); +AsyncDataCache::~AsyncDataCache() {} + +// static +std::shared_ptr AsyncDataCache::create( + memory::MemoryAllocator* allocator, + std::unique_ptr ssdCache) { + auto cache = std::make_shared(allocator, std::move(ssdCache)); + allocator->registerCache(cache); + return cache; +} + +// static +AsyncDataCache* AsyncDataCache::getInstance() { + return *getInstancePtr(); +} + +// static +void AsyncDataCache::setInstance(AsyncDataCache* asyncDataCache) { + *getInstancePtr() = asyncDataCache; +} + +// static +AsyncDataCache** AsyncDataCache::getInstancePtr() { + static AsyncDataCache* cache_{nullptr}; + return &cache_; +} + +void AsyncDataCache::prepareShutdown() { + for (auto& shard : shards_) { + shard->prepareShutdown(); } } +void CacheShard::prepareShutdown() { + entries_.clear(); + freeEntries_.clear(); +} + CachePin AsyncDataCache::findOrCreate( RawFileCacheKey key, uint64_t size, @@ -611,50 +639,6 @@ void AsyncDataCache::backoff(int32_t counter) { std::this_thread::sleep_for(std::chrono::microseconds(usec)); // NOLINT } -bool AsyncDataCache::allocateNonContiguous( - MachinePageCount numPages, - memory::Allocation& out, - ReservationCallback 
reservationCB, - MachinePageCount minSizeClass) { - return makeSpace(numPages, [&]() { - return allocator_->allocateNonContiguous( - numPages, out, reservationCB, minSizeClass); - }); -} - -bool AsyncDataCache::allocateContiguous( - memory::MachinePageCount numPages, - memory::Allocation* collateral, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB, - memory::MachinePageCount maxPages) { - return makeSpace(numPages, [&]() { - return allocator_->allocateContiguous( - numPages, collateral, allocation, reservationCB, maxPages); - }); -} - -bool AsyncDataCache::growContiguous( - MachinePageCount increment, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB) { - return makeSpace(increment, [&]() { - return allocator_->growContiguous(increment, allocation, reservationCB); - }); -} - -void* AsyncDataCache::allocateBytes(uint64_t bytes, uint16_t alignment) { - void* result = nullptr; - makeSpace( - bits::roundUp(bytes, memory::AllocationTraits::kPageSize) / - memory::AllocationTraits::kPageSize, - [&]() { - result = allocator_->allocateBytes(bytes, alignment); - return result != nullptr; - }); - return result; -} - void AsyncDataCache::incrementNew(uint64_t size) { newBytes_ += size; if (!ssdCache_) { @@ -725,7 +709,7 @@ std::string AsyncDataCache::toString() const { << " read pins " << stats.numShared << " write pins " << stats.numExclusive << " unused prefetch " << stats.numPrefetch << " Alloc Megaclocks " << (stats.allocClocks >> 20) - << " allocated pages " << numAllocated() << " cached pages " + << " allocated pages " << allocator_->numAllocated() << " cached pages " << cachedPages_; out << "\nBacking: " << allocator_->toString(); if (ssdCache_) { diff --git a/velox/common/caching/AsyncDataCache.h b/velox/common/caching/AsyncDataCache.h index 2dd8a9098b2f..2a60f8f273f6 100644 --- a/velox/common/caching/AsyncDataCache.h +++ b/velox/common/caching/AsyncDataCache.h @@ -506,30 +506,36 @@ struct CacheStats { std::shared_ptr ssdStats = nullptr; }; -// Collection of cache entries whose key hashes to the same shard of -// the hash number space. The cache population is divided into shards -// to decrease contention on the mutex for the key to entry mapping -// and other housekeeping. + +/// Collection of cache entries whose key hashes to the same shard of +/// the hash number space. The cache population is divided into shards +/// to decrease contention on the mutex for the key to entry mapping +/// and other housekeeping. class CacheShard { public: explicit CacheShard(AsyncDataCache* FOLLY_NONNULL cache) : cache_(cache) {} - // See AsyncDataCache::findOrCreate. + /// See AsyncDataCache::findOrCreate. CachePin findOrCreate( RawFileCacheKey key, uint64_t size, folly::SemiFuture* readyFuture); - // Returns true if there is an entry for 'key'. Updates access time. + /// Returns true if there is an entry for 'key'. Updates access time. bool exists(RawFileCacheKey key) const; - AsyncDataCache* cache() { + AsyncDataCache* cache() const { return cache_; } + std::mutex& mutex() { return mutex_; } + /// Release any resources that consume memory from this 'CacheShard' for a + /// graceful shutdown. The shard will no longer be valid after this call. + void prepareShutdown(); + // removes 'bytesToFree' worth of entries or as many entries as are // not pinned. This favors first removing older and less frequently // used entries. 
If 'evictAllUnpinned' is true, anything that is @@ -609,101 +615,63 @@ class CacheShard { std::atomic allocClocks_; }; -class AsyncDataCache : public memory::MemoryAllocator { +class AsyncDataCache : public memory::Cache { public: - // TODO(jtan6): Remove this constructor after Presto Native switches to below - // constructor AsyncDataCache( - const std::shared_ptr& allocator, - uint64_t maxBytes, + memory::MemoryAllocator* allocator, std::unique_ptr ssdCache = nullptr); - AsyncDataCache( - const std::shared_ptr& allocator, + ~AsyncDataCache() override; + + static std::shared_ptr create( + memory::MemoryAllocator* allocator, std::unique_ptr ssdCache = nullptr); - // Finds or creates a cache entry corresponding to 'key'. The entry - // is returned in 'pin'. If the entry is new, it is pinned in - // exclusive mode and its 'data_' has uninitialized space for at - // least 'size' bytes. If the entry is in cache and already filled, - // the pin is in shared mode. If the entry is in exclusive mode for - // some other pin, the pin is empty. If 'waitFuture' is not nullptr - // and the pin is exclusive on some other pin, this is set to a - // future that is realized when the pin is no longer exclusive. When - // the future is realized, the caller may retry findOrCreate(). - // runtime error with code kNoCacheSpace if there is no space to create the - // new entry after evicting any unpinned content. + static AsyncDataCache* getInstance(); + + static void setInstance(AsyncDataCache* asyncDataCache); + + /// Release any resources that consume memory from 'allocator_' for a graceful + /// shutdown. The cache will no longer be valid after this call. + void prepareShutdown(); + + /// Calls 'allocate' until this returns true. Returns true if + /// allocate returns true. and Tries to evict at least 'numPages' of + /// cache after each failed call to 'allocate'. May pause to wait + /// for SSD cache flush if ''ssdCache_' is set and is busy + /// writing. Does random back-off after several failures and + /// eventually gives up. Allocation must not be serialized by a mutex + /// for memory arbitration to work. + bool makeSpace( + memory::MachinePageCount numPages, + std::function allocate) override; + + memory::MemoryAllocator* allocator() const override { + return allocator_; + } + + /// Finds or creates a cache entry corresponding to 'key'. The entry + /// is returned in 'pin'. If the entry is new, it is pinned in + /// exclusive mode and its 'data_' has uninitialized space for at + /// least 'size' bytes. If the entry is in cache and already filled, + /// the pin is in shared mode. If the entry is in exclusive mode for + /// some other pin, the pin is empty. If 'waitFuture' is not nullptr + /// and the pin is exclusive on some other pin, this is set to a + /// future that is realized when the pin is no longer exclusive. When + /// the future is realized, the caller may retry findOrCreate(). + /// runtime error with code kNoCacheSpace if there is no space to create the + /// new entry after evicting any unpinned content. CachePin findOrCreate( RawFileCacheKey key, uint64_t size, folly::SemiFuture* waitFuture = nullptr); - // Returns true if there is an entry for 'key'. Updates access time. + /// Returns true if there is an entry for 'key'. Updates access time. 
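A minimal usage sketch of findOrCreate() as documented above, assuming 'cache', a populated RawFileCacheKey 'key', and 'dataSize' are in scope:

```cpp
folly::SemiFuture<bool> wait(false);
auto pin = cache->findOrCreate(key, dataSize, &wait);
if (pin.empty()) {
  // Another thread holds the entry exclusively; wait, then retry.
  std::move(wait).wait();
  pin = cache->findOrCreate(key, dataSize, &wait);
}
```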
bool exists(RawFileCacheKey key) const; - Kind kind() const override { - return allocator_->kind(); - } - - size_t capacity() const override { - return allocator_->capacity(); - } - - bool allocateNonContiguous( - memory::MachinePageCount numPages, - memory::Allocation& out, - ReservationCallback reservationCB = nullptr, - memory::MachinePageCount minSizeClass = 0) override; - - int64_t freeNonContiguous(memory::Allocation& allocation) override { - return allocator_->freeNonContiguous(allocation); - } - - bool allocateContiguous( - memory::MachinePageCount numPages, - memory::Allocation* FOLLY_NULLABLE collateral, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, - memory::MachinePageCount maxPages = 0) override; - - void freeContiguous(memory::ContiguousAllocation& allocation) override { - allocator_->freeContiguous(allocation); - } - - bool growContiguous( - memory::MachinePageCount increment, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) override; - - void* allocateBytes(uint64_t bytes, uint16_t alignment) override; - - void freeBytes(void* p, uint64_t size) noexcept override { - allocator_->freeBytes(p, size); - } - - bool checkConsistency() const override { - return allocator_->checkConsistency(); - } - - const std::vector& sizeClasses() const override { - return allocator_->sizeClasses(); - } - - size_t totalUsedBytes() const override { - return allocator_->totalUsedBytes(); - } - - memory::MachinePageCount numAllocated() const override { - return allocator_->numAllocated(); - } - - memory::MachinePageCount numMapped() const override { - return allocator_->numMapped(); - } - CacheStats refreshStats() const; - std::string toString() const override; + std::string toString() const; memory::MachinePageCount incrementCachedPages(int64_t pages) { // The counter is unsigned and the increment is signed. @@ -719,19 +687,19 @@ class AsyncDataCache : public memory::MemoryAllocator { return ssdCache_.get(); } - // Updates stats for creation of a new cache entry of 'size' bytes, - // i.e. a cache miss. Periodically updates SSD admission criteria, - // i.e. reconsider criteria every half cache capacity worth of misses. + /// Updates stats for creation of a new cache entry of 'size' bytes, + /// i.e. a cache miss. Periodically updates SSD admission criteria, + /// i.e. reconsider criteria every half cache capacity worth of misses. void incrementNew(uint64_t size); - // Updates statistics after bringing in 'bytes' worth of data that - // qualifies for SSD save and is not backed by SSD. Periodically - // triggers a background write of eligible entries to SSD. + /// Updates statistics after bringing in 'bytes' worth of data that + /// qualifies for SSD save and is not backed by SSD. Periodically + /// triggers a background write of eligible entries to SSD. void possibleSsdSave(uint64_t bytes); - // Sets a callback applied to new entries at the point where - // they are set to shared mode. Used for testing and can be used for - // e.g. checking checksums. + /// Sets a callback applied to new entries at the point where + /// they are set to shared mode. Used for testing and can be used for + /// e.g. checking checksums. 
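+  /// For example, a test might install:
+  ///   cache->setVerifyHook(
+  ///       [](const AsyncDataCacheEntry& entry) { /* verify a checksum */ });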
void setVerifyHook(std::function hook) { verifyHook_ = hook; } @@ -769,29 +737,16 @@ class AsyncDataCache : public memory::MemoryAllocator { return numSkippedSaves_; } - memory::Stats stats() const override { - return allocator_->stats(); - } - private: static constexpr int32_t kNumShards = 4; // Must be power of 2. static constexpr int32_t kShardMask = kNumShards - 1; + static AsyncDataCache** getInstancePtr(); + // Waits a pseudorandom delay times 'counter'. void backoff(int32_t counter); - // Calls 'allocate' until this returns true. Returns true if - // allocate returns true. and Tries to evict at least 'numPages' of - // cache after each failed call to 'allocate'. May pause to wait - // for SSD cache flush if ''ssdCache_' is set and is busy - // writing. Does random back-off after several failures and - // eventually gives up. Allocation must not be serialized by a mutex - // for memory arbitration to work. - bool makeSpace( - memory::MachinePageCount numPages, - std::function allocate); - - std::shared_ptr allocator_; + memory::MemoryAllocator* const allocator_; std::unique_ptr ssdCache_; std::vector> shards_; std::atomic shardCounter_{0}; diff --git a/velox/common/caching/tests/AsyncDataCacheTest.cpp b/velox/common/caching/tests/AsyncDataCacheTest.cpp index 1c3580a9d860..66bb6dd65cbd 100644 --- a/velox/common/caching/tests/AsyncDataCacheTest.cpp +++ b/velox/common/caching/tests/AsyncDataCacheTest.cpp @@ -81,6 +81,7 @@ class AsyncDataCacheTest : public testing::Test { if (ssdCache) { ssdCache->deleteFiles(); } + cache_->prepareShutdown(); } } @@ -104,18 +105,21 @@ class AsyncDataCacheTest : public testing::Test { } memory::MmapAllocator::Options options; options.capacity = maxBytes; - cache_ = std::make_shared( - std::make_shared(options), - maxBytes, - std::move(ssdCache)); + if (cache_) { + cache_->prepareShutdown(); + } + cache_.reset(); + allocator_.reset(); + allocator_ = std::make_shared(options); + cache_ = AsyncDataCache::create(allocator_.get(), std::move(ssdCache)); if (filenames_.empty()) { for (auto i = 0; i < kNumFiles; ++i) { auto name = fmt::format("testing_file_{}", i); filenames_.push_back(StringIdLease(fileIds(), name)); } } - ASSERT_EQ(cache_->kind(), MemoryAllocator::Kind::kMmap); - ASSERT_EQ(MemoryAllocator::kindString(cache_->kind()), "MMAP"); + ASSERT_EQ(cache_->allocator()->kind(), MemoryAllocator::Kind::kMmap); + ASSERT_EQ(MemoryAllocator::kindString(cache_->allocator()->kind()), "MMAP"); } // Finds one entry from RAM, SSD or storage. Throws if the data @@ -222,12 +226,13 @@ class AsyncDataCacheTest : public testing::Test { void clearAllocations(std::deque& allocations) { while (!allocations.empty()) { - cache_->freeNonContiguous(allocations.front()); + allocator_->freeNonContiguous(allocations.front()); allocations.pop_front(); } } std::shared_ptr tempDirectory_; + std::shared_ptr allocator_; std::shared_ptr cache_; std::vector filenames_; std::unique_ptr executor_; @@ -588,7 +593,7 @@ TEST_F(AsyncDataCacheTest, outOfCapacity) { pins.pop_front(); } memory::Allocation allocation; - ASSERT_FALSE(cache_->allocateNonContiguous(kSizeInPages, allocation)); + ASSERT_FALSE(allocator_->allocateNonContiguous(kSizeInPages, allocation)); // One 4 page entry below the max size of 4K 4 page entries in 16MB of // capacity. ASSERT_EQ(16384, cache_->incrementCachedPages(0)); @@ -597,14 +602,14 @@ TEST_F(AsyncDataCacheTest, outOfCapacity) { // We allocate the full capacity and expect the cache entries to go. 
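  // (With the cache registered on 'allocator_', each failed
  // allocateNonContiguous() call below evicts unpinned cache entries via
  // Cache::makeSpace() and retries, so the loop ends with the cache empty.)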
for (;;) { - if (!cache_->allocateNonContiguous(kSizeInPages, allocation)) { + if (!allocator_->allocateNonContiguous(kSizeInPages, allocation)) { break; } allocations.push_back(std::move(allocation)); } EXPECT_EQ(0, cache_->incrementCachedPages(0)); EXPECT_EQ(0, cache_->incrementPrefetchPages(0)); - EXPECT_EQ(16384, cache_->numAllocated()); + EXPECT_EQ(16384, allocator_->numAllocated()); clearAllocations(allocations); } diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 33ed1cf7f7ca..c4d7c93df2bd 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -45,6 +45,9 @@ class SsdFileTest : public testing::Test { if (ssdFile_) { ssdFile_->deleteFile(); } + if (cache_) { + cache_->prepareShutdown(); + } } void initializeCache( @@ -53,8 +56,7 @@ class SsdFileTest : public testing::Test { bool setNoCowFlag = false) { // tmpfs does not support O_DIRECT, so turn this off for testing. FLAGS_ssd_odirect = false; - cache_ = std::make_shared( - MemoryAllocator::createDefaultInstance(), maxBytes); + cache_ = AsyncDataCache::create(MemoryAllocator::getInstance()); fileName_ = StringIdLease(fileIds(), "fileInStorage"); diff --git a/velox/common/compression/Compression.cpp b/velox/common/compression/Compression.cpp index c200a62d7021..eae2db24379c 100644 --- a/velox/common/compression/Compression.cpp +++ b/velox/common/compression/Compression.cpp @@ -15,11 +15,47 @@ */ #include "velox/common/compression/Compression.h" +#include "velox/common/base/Exceptions.h" #include namespace facebook::velox::common { +std::unique_ptr compressionKindToCodec(CompressionKind kind) { + switch (static_cast(kind)) { + case CompressionKind_NONE: + return getCodec(folly::io::CodecType::NO_COMPRESSION); + case CompressionKind_ZLIB: + return getCodec(folly::io::CodecType::ZLIB); + case CompressionKind_SNAPPY: + return getCodec(folly::io::CodecType::SNAPPY); + case CompressionKind_ZSTD: + return getCodec(folly::io::CodecType::ZSTD); + case CompressionKind_LZ4: + return getCodec(folly::io::CodecType::LZ4); + default: + VELOX_UNSUPPORTED( + "Not support {} in folly", compressionKindToString(kind)); + } +} + +CompressionKind codecTypeToCompressionKind(folly::io::CodecType type) { + switch (type) { + case folly::io::CodecType::NO_COMPRESSION: + return CompressionKind_NONE; + case folly::io::CodecType::ZLIB: + return CompressionKind_ZLIB; + case folly::io::CodecType::SNAPPY: + return CompressionKind_SNAPPY; + case folly::io::CodecType::ZSTD: + return CompressionKind_ZSTD; + case folly::io::CodecType::LZ4: + return CompressionKind_LZ4; + default: + VELOX_UNSUPPORTED("Not support folly codec type {}", type); + } +} + std::string compressionKindToString(CompressionKind kind) { switch (static_cast(kind)) { case CompressionKind_NONE: diff --git a/velox/common/compression/Compression.h b/velox/common/compression/Compression.h index 2262c7785911..c1af44fd606a 100644 --- a/velox/common/compression/Compression.h +++ b/velox/common/compression/Compression.h @@ -16,6 +16,7 @@ #pragma once +#include #include namespace facebook::velox::common { @@ -31,6 +32,10 @@ enum CompressionKind { CompressionKind_MAX = INT64_MAX }; +std::unique_ptr compressionKindToCodec(CompressionKind kind); + +CompressionKind codecTypeToCompressionKind(folly::io::CodecType type); + /** * Get the name of the CompressionKind. 
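 * For example, compressionKindToString(CompressionKind_ZSTD) returns "zstd",
 * and an unrecognized value is rendered as "unknown - <value>".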
*/ diff --git a/velox/common/compression/tests/CompressionTest.cpp b/velox/common/compression/tests/CompressionTest.cpp index b15e9207ef1e..0659fa18222e 100644 --- a/velox/common/compression/tests/CompressionTest.cpp +++ b/velox/common/compression/tests/CompressionTest.cpp @@ -24,7 +24,7 @@ using namespace facebook::velox::common; class CompressionTest : public testing::Test {}; -TEST(CompressionTest, testCompressionNames) { +TEST_F(CompressionTest, testCompressionNames) { EXPECT_EQ("none", compressionKindToString(CompressionKind_NONE)); EXPECT_EQ("zlib", compressionKindToString(CompressionKind_ZLIB)); EXPECT_EQ("snappy", compressionKindToString(CompressionKind_SNAPPY)); @@ -35,3 +35,15 @@ TEST(CompressionTest, testCompressionNames) { "unknown - 99", compressionKindToString(static_cast(99))); } + +TEST_F(CompressionTest, compressionKindToCodec) { + ASSERT_EQ( + folly::io::CodecType::NO_COMPRESSION, + compressionKindToCodec(CompressionKind::CompressionKind_NONE)->type()); + ASSERT_EQ( + folly::io::CodecType::LZ4, + compressionKindToCodec(CompressionKind::CompressionKind_LZ4)->type()); + EXPECT_THROW( + compressionKindToCodec(CompressionKind::CompressionKind_LZO), + facebook::velox::VeloxException); +} diff --git a/velox/common/memory/MallocAllocator.cpp b/velox/common/memory/MallocAllocator.cpp index df63562916cd..73c35c18a3f1 100644 --- a/velox/common/memory/MallocAllocator.cpp +++ b/velox/common/memory/MallocAllocator.cpp @@ -23,7 +23,7 @@ namespace facebook::velox::memory { MallocAllocator::MallocAllocator(size_t capacity) : kind_(MemoryAllocator::Kind::kMalloc), capacity_(capacity) {} -bool MallocAllocator::allocateNonContiguous( +bool MallocAllocator::allocateNonContiguousWithoutRetry( MachinePageCount numPages, Allocation& out, ReservationCallback reservationCB, @@ -109,6 +109,20 @@ bool MallocAllocator::allocateNonContiguous( return true; } +bool MallocAllocator::allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB, + MachinePageCount maxPages) { + bool result; + stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { + result = allocateContiguousImpl( + numPages, collateral, allocation, reservationCB, maxPages); + }); + return result; +} + bool MallocAllocator::allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, @@ -216,6 +230,11 @@ int64_t MallocAllocator::freeNonContiguous(Allocation& allocation) { return freedBytes; } +void MallocAllocator::freeContiguous(ContiguousAllocation& allocation) { + stats_.recordFree( + allocation.size(), [&]() { freeContiguousImpl(allocation); }); +} + void MallocAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { if (allocation.empty()) { return; @@ -233,7 +252,7 @@ void MallocAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { allocation.clear(); } -bool MallocAllocator::growContiguous( +bool MallocAllocator::growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB) { @@ -259,7 +278,9 @@ bool MallocAllocator::growContiguous( return true; } -void* MallocAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { +void* MallocAllocator::allocateBytesWithoutRetry( + uint64_t bytes, + uint16_t alignment) { if (!incrementUsage(bytes)) { return nullptr; } @@ -279,7 +300,7 @@ void* MallocAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { return result; } -void* MallocAllocator::allocateZeroFilled(uint64_t 
bytes) { +void* MallocAllocator::allocateZeroFilledWithoutRetry(uint64_t bytes) { if (!incrementUsage(bytes)) { return nullptr; } diff --git a/velox/common/memory/MallocAllocator.h b/velox/common/memory/MallocAllocator.h index 24503c4570a1..debc07cdc962 100644 --- a/velox/common/memory/MallocAllocator.h +++ b/velox/common/memory/MallocAllocator.h @@ -37,6 +37,17 @@ class MallocAllocator : public MemoryAllocator { } } + void registerCache(const std::shared_ptr& cache) override { + VELOX_CHECK_NULL(cache_); + VELOX_CHECK_NOT_NULL(cache); + VELOX_CHECK(cache->allocator() == this); + cache_ = cache; + } + + Cache* cache() const override { + return cache_.get(); + } + Kind kind() const override { return kind_; } @@ -45,42 +56,15 @@ class MallocAllocator : public MemoryAllocator { return capacity_; } - bool allocateNonContiguous( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) override; + void freeContiguous(ContiguousAllocation& allocation) override; int64_t freeNonContiguous(Allocation& allocation) override; - bool allocateContiguous( - MachinePageCount numPages, - Allocation* collateral, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, - MachinePageCount maxPages = 0) override { - bool result; - stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { - result = allocateContiguousImpl( - numPages, collateral, allocation, reservationCB, maxPages); - }); - return result; - } - - void freeContiguous(ContiguousAllocation& allocation) override { - stats_.recordFree( - allocation.size(), [&]() { freeContiguousImpl(allocation); }); - } - - bool growContiguous( + bool growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB = nullptr) override; - void* allocateBytes(uint64_t bytes, uint16_t alignment) override; - - void* allocateZeroFilled(uint64_t bytes) override; - void freeBytes(void* p, uint64_t bytes) noexcept override; size_t totalUsedBytes() const override { @@ -95,15 +79,24 @@ class MallocAllocator : public MemoryAllocator { return numMapped_; } - Stats stats() const override { - return stats_; - } - bool checkConsistency() const override; std::string toString() const override; private: + bool allocateNonContiguousWithoutRetry( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB = nullptr, + MachinePageCount minSizeClass = 0) override; + + bool allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* FOLLY_NULLABLE collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr, + MachinePageCount maxPages = 0) override; + bool allocateContiguousImpl( MachinePageCount numPages, Allocation* FOLLY_NULLABLE collateral, @@ -113,6 +106,10 @@ class MallocAllocator : public MemoryAllocator { void freeContiguousImpl(ContiguousAllocation& allocation); + void* allocateBytesWithoutRetry(uint64_t bytes, uint16_t alignment) override; + + void* allocateZeroFilledWithoutRetry(uint64_t bytes) override; + /// Increment current usage and check current allocator consistency to make /// sure current usage does not go above 'capacity_'. If it goes above /// 'capacity_', the increment will not be applied. Returns true if within @@ -161,6 +158,6 @@ class MallocAllocator : public MemoryAllocator { /// Tracks malloc'd pointers to detect bad frees. 
std::unordered_set<void*> mallocs_; - Stats stats_; + std::shared_ptr<Cache> cache_; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/MemoryAllocator.cpp b/velox/common/memory/MemoryAllocator.cpp index ccdbb4c448d9..a4a684e3384b 100644 --- a/velox/common/memory/MemoryAllocator.cpp +++ b/velox/common/memory/MemoryAllocator.cpp @@ -160,7 +160,74 @@ MachinePageCount MemoryAllocator::roundUpToSizeClassSize( return *std::lower_bound(sizes.begin(), sizes.end(), pages); } +bool MemoryAllocator::allocateNonContiguous( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB, + MachinePageCount minSizeClass) { + if (cache() == nullptr) { + return allocateNonContiguousWithoutRetry( + numPages, out, reservationCB, minSizeClass); + } + return cache()->makeSpace(numPages, [&]() { + return allocateNonContiguousWithoutRetry( + numPages, out, reservationCB, minSizeClass); + }); +} + +bool MemoryAllocator::allocateContiguous( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB, + MachinePageCount maxPages) { + if (cache() == nullptr) { + return allocateContiguousWithoutRetry( + numPages, collateral, allocation, reservationCB, maxPages); + } + return cache()->makeSpace(numPages, [&]() { + return allocateContiguousWithoutRetry( + numPages, collateral, allocation, reservationCB, maxPages); + }); +} + +bool MemoryAllocator::growContiguous( + MachinePageCount increment, + ContiguousAllocation& allocation, + ReservationCallback reservationCB) { + if (cache() == nullptr) { + return growContiguousWithoutRetry(increment, allocation, reservationCB); + } + return cache()->makeSpace(increment, [&]() { + return growContiguousWithoutRetry(increment, allocation, reservationCB); + }); +} + +void* MemoryAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { + if (cache() == nullptr) { + return allocateBytesWithoutRetry(bytes, alignment); + } + void* result = nullptr; + cache()->makeSpace(AllocationTraits::numPages(bytes), [&]() { + result = allocateBytesWithoutRetry(bytes, alignment); + return result != nullptr; + }); + return result; +} + void* MemoryAllocator::allocateZeroFilled(uint64_t bytes) { + if (cache() == nullptr) { + return allocateZeroFilledWithoutRetry(bytes); + } + void* result = nullptr; + cache()->makeSpace(AllocationTraits::numPages(bytes), [&]() { + result = allocateZeroFilledWithoutRetry(bytes); + return result != nullptr; + }); + return result; +} + +void* MemoryAllocator::allocateZeroFilledWithoutRetry(uint64_t bytes) { void* result = allocateBytes(bytes); if (result != nullptr) { ::memset(result, 0, bytes); diff --git a/velox/common/memory/MemoryAllocator.h b/velox/common/memory/MemoryAllocator.h index 6cd0c76d6a6b..cae015620256 100644 --- a/velox/common/memory/MemoryAllocator.h +++ b/velox/common/memory/MemoryAllocator.h @@ -139,6 +139,24 @@ struct Stats { int64_t numAdvise{0}; }; +class MemoryAllocator; + +/// A general cache interface using 'MemoryAllocator' to allocate memory, which +/// is also able to free up memory upon request by shrinking itself. +class Cache { + public: + virtual ~Cache() = default; + /// This method should be implemented so that it tries to accommodate the + /// passed in 'allocate' by freeing up space from 'this' if needed. 'numPages' + /// is the number of pages 'allocate' tries to allocate. It should return true + /// if 'allocate' succeeds, and false otherwise.
+ virtual bool makeSpace( + memory::MachinePageCount numPages, + std::function allocate) = 0; + + virtual MemoryAllocator* allocator() const = 0; +}; + /// This class provides interface for the actual memory allocations from memory /// pool. It allocates runs of machine pages from predefined size classes, and /// supports both contiguous and non-contiguous memory allocations. An @@ -193,6 +211,11 @@ class MemoryAllocator : public std::enable_shared_from_this { /// the kind of the delegated memory allocator underneath. virtual Kind kind() const = 0; + /// Registers a 'Cache' that is used for freeing up space when this allocator + /// is under memory pressure. The allocator of registered 'Cache' needs to be + /// the same as 'this'. + virtual void registerCache(const std::shared_ptr& cache) = 0; + using ReservationCallback = std::function; /// Returns the capacity of the allocator in bytes. @@ -210,18 +233,20 @@ class MemoryAllocator : public std::enable_shared_from_this { /// 'reservationCB' before the actual memory allocation so it needs to release /// the reservation if the actual allocation fails halfway. The function /// returns true if the allocation succeeded. If it returns false, 'out' - /// references no memory and any partially allocated memory is freed. + /// references no memory and any partially allocated memory is freed. The + /// function might retry allocation failure by making space from 'cache()' if + /// registered. But sufficient space is not guaranteed. /// /// NOTE: /// - 'out' is guaranteed to be freed if it's not empty. /// - Allocation is not guaranteed even if collateral 'out' is larger than /// 'numPages', because this method is not atomic. /// - Throws if allocation exceeds capacity. - virtual bool allocateNonContiguous( + bool allocateNonContiguous( MachinePageCount numPages, Allocation& out, ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) = 0; + MachinePageCount minSizeClass = 0); /// Frees non-contiguous 'allocation'. 'allocation' is empty on return. The /// function returns the actual freed bytes. @@ -241,19 +266,18 @@ class MemoryAllocator : public std::enable_shared_from_this { /// cleared. /// /// NOTE: - 'collateral' and passed in 'allocation' are guaranteed - /// to be freed. If 'maxPages' is non-0, 'maxPages' worth of - /// address space is mapped but the utilization in the allocator and - /// pool is incremented by 'numPages'. This allows reserving - /// a large range of addresses for use with huge pages without - /// declaring the whole range as held by the query. The reservation - /// will be increased as and if addresses in the range are used. See - /// growContiguous(). - virtual bool allocateContiguous( + /// to be freed. If 'maxPages' is non-0, 'maxPages' worth of address space is + /// mapped but the utilization in the allocator and pool is incremented by + /// 'numPages'. This allows reserving a large range of addresses for use with + /// huge pages without declaring the whole range as held by the query. The + /// reservation will be increased as and if addresses in the range are used. + /// See growContiguous(). + bool allocateContiguous( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, ReservationCallback reservationCB = nullptr, - MachinePageCount maxPages = 0) = 0; + MachinePageCount maxPages = 0); /// Frees contiguous 'allocation'. 'allocation' is empty on return. 
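A minimal sketch of the resulting retry wiring, using the names introduced in this header and assuming an MmapAllocator 'options' struct:

```cpp
auto allocator = std::make_shared<memory::MmapAllocator>(options);
auto cache = cache::AsyncDataCache::create(allocator.get());
// create() calls allocator->registerCache(cache), so cache() is non-null
// and a failing allocation below is retried after evicting unpinned
// cache entries through Cache::makeSpace().
memory::Allocation out;
bool ok = allocator->allocateNonContiguous(16, out);
```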
virtual void freeContiguous(ContiguousAllocation& allocation) = 0; @@ -262,24 +286,26 @@ class MemoryAllocator : public std::enable_shared_from_this { /// 'increment'. false if would exceed capacity, Throws if size /// would exceed maxSize given in allocateContiguous(). Calls reservationCB /// before increasing the utilization and returns false with no effect if this - /// fails. - virtual bool growContiguous( + /// fails. The function might retry allocation failure by making + /// space from 'cache()' if registered. But sufficient space is not guaranteed + bool growContiguous( MachinePageCount increment, ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) = 0; + ReservationCallback reservationCB = nullptr); /// Allocates contiguous 'bytes' and return the first byte. Returns nullptr if - /// there is no space. + /// there is no space. The function might retry allocation failure by making + /// space from 'cache()' if registered. But sufficient space is not + /// guaranteed. /// /// NOTE: 'alignment' must be power of two and in range of /// [kMinAlignment, kMaxAlignment]. - virtual void* allocateBytes( - uint64_t bytes, - uint16_t alignment = kMinAlignment) = 0; + void* allocateBytes(uint64_t bytes, uint16_t alignment = kMinAlignment); /// Allocates a zero-filled contiguous bytes. Returns nullptr if there is no - /// space - virtual void* allocateZeroFilled(uint64_t bytes); + /// space. The function might retry allocation failure by making space from + /// 'cache()' if registered. But sufficient space is not guaranteed. + void* allocateZeroFilled(uint64_t bytes); /// Frees contiguous memory allocated by allocateBytes, allocateZeroFilled, /// reallocateBytes. @@ -306,7 +332,7 @@ class MemoryAllocator : public std::enable_shared_from_this { virtual MachinePageCount numMapped() const = 0; virtual Stats stats() const { - return Stats(); + return stats_; } virtual std::string toString() const = 0; @@ -356,7 +382,38 @@ class MemoryAllocator : public std::enable_shared_from_this { } protected: - MemoryAllocator() = default; + explicit MemoryAllocator() = default; + + /// The actual memory allocation function implementation without retry + /// attempts by making space from cache. + virtual bool allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr, + MachinePageCount maxPages = 0) = 0; + + virtual bool allocateNonContiguousWithoutRetry( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB, + MachinePageCount minSizeClass) = 0; + + virtual void* allocateBytesWithoutRetry( + uint64_t bytes, + uint16_t alignment) = 0; + + virtual void* allocateZeroFilledWithoutRetry(uint64_t bytes); + + virtual bool growContiguousWithoutRetry( + MachinePageCount increment, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr) = 0; + + // 'Cache' getter. The cache is only responsible for freeing up memory space + // by shrinking itself when there is not enough space upon allocating. The + // free of space is not guaranteed. + virtual Cache* cache() const = 0; // Returns the size class size that corresponds to 'bytes'. 
static MachinePageCount roundUpToSizeClassSize( @@ -422,6 +479,8 @@ class MemoryAllocator : public std::enable_shared_from_this { InjectedFailure injectedFailure_{InjectedFailure::kNone}; bool isPersistentFailureInjection_{false}; + Stats stats_; + private: static std::mutex initMutex_; // Singleton instance. diff --git a/velox/common/memory/MmapAllocator.cpp b/velox/common/memory/MmapAllocator.cpp index 8da35bc5bd5a..3964b2268cb5 100644 --- a/velox/common/memory/MmapAllocator.cpp +++ b/velox/common/memory/MmapAllocator.cpp @@ -51,7 +51,7 @@ MmapAllocator::~MmapAllocator() { (numAllocated_ == 0) && (numExternalMapped_ == 0), "{}", toString()); } -bool MmapAllocator::allocateNonContiguous( +bool MmapAllocator::allocateNonContiguousWithoutRetry( MachinePageCount numPages, Allocation& out, ReservationCallback reservationCB, @@ -211,6 +211,20 @@ MachinePageCount MmapAllocator::freeInternal(Allocation& allocation) { return numFreed; } +bool MmapAllocator::allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB, + MachinePageCount maxPages) { + bool result; + stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { + result = allocateContiguousImpl( + numPages, collateral, allocation, reservationCB, maxPages); + }); + return result; +} + bool MmapAllocator::allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, @@ -364,6 +378,11 @@ bool MmapAllocator::allocateContiguousImpl( return true; } +void MmapAllocator::freeContiguous(ContiguousAllocation& allocation) { + stats_.recordFree( + allocation.size(), [&]() { freeContiguousImpl(allocation); }); +} + void MmapAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { if (allocation.empty()) { return; @@ -384,7 +403,7 @@ void MmapAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { allocation.clear(); } -bool MmapAllocator::growContiguous( +bool MmapAllocator::growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB) { @@ -430,7 +449,9 @@ bool MmapAllocator::growContiguous( return true; } -void* MmapAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { +void* MmapAllocator::allocateBytesWithoutRetry( + uint64_t bytes, + uint16_t alignment) { alignmentCheck(bytes, alignment); if (useMalloc(bytes)) { @@ -448,7 +469,8 @@ void* MmapAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { if (bytes <= AllocationTraits::pageBytes(sizeClassSizes_.back())) { Allocation allocation; const auto numPages = roundUpToSizeClassSize(bytes, sizeClassSizes_); - if (!allocateNonContiguous(numPages, allocation, nullptr, numPages)) { + if (!allocateNonContiguousWithoutRetry( + numPages, allocation, nullptr, numPages)) { return nullptr; } auto run = allocation.runAt(0); @@ -463,7 +485,7 @@ void* MmapAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { ContiguousAllocation allocation; auto numPages = bits::roundUp(bytes, AllocationTraits::kPageSize) / AllocationTraits::kPageSize; - if (!allocateContiguous(numPages, nullptr, allocation)) { + if (!allocateContiguousWithoutRetry(numPages, nullptr, allocation)) { return nullptr; } diff --git a/velox/common/memory/MmapAllocator.h b/velox/common/memory/MmapAllocator.h index 0ecc872e17c9..43863af9f535 100644 --- a/velox/common/memory/MmapAllocator.h +++ b/velox/common/memory/MmapAllocator.h @@ -85,56 +85,29 @@ class MmapAllocator : public MemoryAllocator { return kind_; } - size_t 
capacity() const override { - return AllocationTraits::pageBytes(capacity_); + void registerCache(const std::shared_ptr& cache) override { + VELOX_CHECK_NULL(cache_); + VELOX_CHECK_NOT_NULL(cache); + VELOX_CHECK(cache->allocator() == this); + cache_ = cache; } - bool allocateNonContiguous( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) override; - - int64_t freeNonContiguous(Allocation& allocation) override; - - bool allocateContiguous( - MachinePageCount numPages, - Allocation* collateral, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, - MachinePageCount maxPages = 0) override { - bool result; - stats_.recordAllocate(numPages * AllocationTraits::kPageSize, 1, [&]() { - result = allocateContiguousImpl( - numPages, collateral, allocation, reservationCB, maxPages); - }); - return result; + Cache* cache() const override { + return cache_.get(); } - void freeContiguous(ContiguousAllocation& allocation) override { - stats_.recordFree( - allocation.size(), [&]() { freeContiguousImpl(allocation); }); + size_t capacity() const override { + return AllocationTraits::pageBytes(capacity_); } - bool growContiguous( + bool growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB = nullptr) override; - /// Allocates 'bytes' contiguous bytes and returns the pointer to the first - /// byte. If 'bytes' is less than 'maxMallocBytes_', delegates the allocation - /// to malloc. If the size is above that and below the largest size classes' - /// size, allocates one element of the next size classes' size. If 'size' is - /// greater than the largest size classes' size, calls allocateContiguous(). - /// Returns nullptr if there is no space. The amount to allocate is subject to - /// the size limit of 'this'. This function is not virtual but calls the - /// virtual functions allocateNonContiguous and allocateContiguous, which can - /// track sizes and enforce caps etc. If 'alignment' is not kMinAlignment, - /// then 'bytes' must be a multiple of 'alignment'. - /// - /// NOTE: 'alignment' must be power of two and in range of [kMinAlignment, - /// kMaxAlignment]. - void* allocateBytes(uint64_t bytes, uint16_t alignment) override; + void freeContiguous(ContiguousAllocation& allocation) override; + + int64_t freeNonContiguous(Allocation& allocation) override; void freeBytes(void* p, uint64_t bytes) noexcept override; @@ -339,6 +312,19 @@ class MmapAllocator : public MemoryAllocator { uint64_t numAdvisedAway_ = 0; }; + bool allocateNonContiguousWithoutRetry( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB = nullptr, + MachinePageCount minSizeClass = 0) override; + + bool allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr, + MachinePageCount maxPages = 0) override; + bool allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, @@ -348,6 +334,21 @@ class MmapAllocator : public MemoryAllocator { void freeContiguousImpl(ContiguousAllocation& allocation); + // Allocates 'bytes' contiguous bytes and returns the pointer to the first + // byte. If 'bytes' is less than 'maxMallocBytes_', delegates the allocation + // to malloc. If the size is above that and below the largest size classes' + // size, allocates one element of the next size classes' size. 
If 'size' is + // greater than the largest size classes' size, calls allocateContiguous(). + // Returns nullptr if there is no space. The amount to allocate is subject to + // the size limit of 'this'. This function is not virtual but calls the + // virtual functions allocateNonContiguous and allocateContiguous, which can + // track sizes and enforce caps etc. If 'alignment' is not kMinAlignment, + // then 'bytes' must be a multiple of 'alignment'. + // + // NOTE: 'alignment' must be power of two and in range of [kMinAlignment, + // kMaxAlignment]. + void* allocateBytesWithoutRetry(uint64_t bytes, uint16_t alignment) override; + // Ensures that there are at least 'newMappedNeeded' pages that are // not backing any existing allocation. If capacity_ - numMapped_ < // newMappedNeeded, advises away enough pages backing freed slots in @@ -418,7 +419,7 @@ class MmapAllocator : public MemoryAllocator { std::mutex arenaMutex_; std::unique_ptr managedArenas_; - Stats stats_; + std::shared_ptr cache_; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/MemoryPoolTest.cpp b/velox/common/memory/tests/MemoryPoolTest.cpp index e2e71c3496c7..eabc39a3a958 100644 --- a/velox/common/memory/tests/MemoryPoolTest.cpp +++ b/velox/common/memory/tests/MemoryPoolTest.cpp @@ -88,18 +88,16 @@ class MemoryPoolTest : public testing::TestWithParam { MmapAllocator::Options opts{8UL << 30}; allocator_ = std::make_shared(opts); if (useCache_) { - cache_ = - std::make_shared(allocator_, kCapacity, nullptr); - MemoryAllocator::setDefaultInstance(cache_.get()); + cache_ = AsyncDataCache::create(allocator_.get()); + MemoryAllocator::setDefaultInstance(allocator_.get()); } else { MemoryAllocator::setDefaultInstance(allocator_.get()); } } else { allocator_ = MemoryAllocator::createDefaultInstance(); if (useCache_) { - cache_ = - std::make_shared(allocator_, kCapacity, nullptr); - MemoryAllocator::setDefaultInstance(cache_.get()); + cache_ = AsyncDataCache::create(allocator_.get()); + MemoryAllocator::setDefaultInstance(allocator_.get()); } else { MemoryAllocator::setDefaultInstance(allocator_.get()); } @@ -111,6 +109,9 @@ class MemoryPoolTest : public testing::TestWithParam { } void TearDown() override { + if (useCache_) { + cache_->prepareShutdown(); + } allocator_->testingClearFailureInjection(); MmapAllocator::setDefaultInstance(nullptr); } diff --git a/velox/common/memory/tests/SharedArbitratorTest.cpp b/velox/common/memory/tests/SharedArbitratorTest.cpp index c4a04ed84f7c..660974cd5d97 100644 --- a/velox/common/memory/tests/SharedArbitratorTest.cpp +++ b/velox/common/memory/tests/SharedArbitratorTest.cpp @@ -27,6 +27,7 @@ #include "velox/common/memory/MemoryArbitrator.h" #include "velox/common/memory/SharedArbitrator.h" #include "velox/common/testutil/TestValue.h" +#include "velox/exec/HashTable.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/HiveConnectorTestBase.h" #include "velox/exec/tests/utils/PlanBuilder.h" @@ -311,7 +312,7 @@ class SharedArbitrationTest : public exec::test::HiveConnectorTestBase { executor_.get(), std::unordered_map{}, configs, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), std::move(pool)); return queryCtx; } @@ -1584,113 +1585,43 @@ DEBUG_ONLY_TEST_F( createDuckDbTable(vectors); std::shared_ptr joinQueryCtx = newQueryCtx(kMemoryCapacity); - std::shared_ptr fakeQueryCtx = newQueryCtx(kMemoryCapacity); - // Set fake operator to reclaimable to allow arbitration to succeed. 
- fakeOperatorFactory_->setCanReclaim(true); - - folly::EventCount waitForPrepareJoin; - folly::EventCount waitForFakeAllocationDone; - std::atomic fakeAllocationDone{false}; - std::atomic startPrepareJoin{false}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (fakeAllocationDone) { - return Allocation{}; - } - - // Wait for hash join build to start table build at the end of hash build - // phase. - waitForPrepareJoin.await([&]() { return startPrepareJoin.load(); }); - - // Set to allocate all the remaining free memory from the arbitrator. - const auto allocationSize = - kMemoryCapacity - joinQueryCtx->pool()->currentBytes(); - auto buffer = op->pool()->allocate(allocationSize); - // Unblock table build and expect any memory allocation by parallel table - // build to trigger memory arbitration. - fakeAllocationDone = true; - waitForFakeAllocationDone.notifyAll(); - return Allocation{op->pool(), buffer, allocationSize}; - }); - - std::vector extraAllocations; + // Make sure the parallel build has been triggered. + std::atomic parallelBuildTriggered{false}; SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashBuild::prepareJoinTable", - std::function*)>( - ([&](std::vector* buildOps) { - // Free up the unused memory reservations from all the hash build - // memory pool to ensure triggering memory arbitration in parallel - // build. - for (auto* op : *buildOps) { - const size_t allocationSize = op->pool()->availableReservation(); - if (allocationSize > 0) { - extraAllocations.push_back(Allocation{ - op->pool(), - op->pool()->allocate(allocationSize), - allocationSize}); - } - } - // Unblock fake memory allocation to allocate all the freed memory - // from arbitrator. - startPrepareJoin = true; - waitForPrepareJoin.notifyAll(); - // Wait for the fake memory allocation to complete before - // proceeding with the parallel build. - waitForFakeAllocationDone.await( - [&]() { return fakeAllocationDone.load(); }); - }))); - - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - // Set very low table size threshold to trigger parallel build. - .config( - core::QueryConfig::kMinTableRowsForParallelJoinBuild, - std::to_string(0)) - // Set multiple hash build drivers to trigger parallel build. - .maxDrivers(4) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0", "t1"}, - {"u1", "u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); + "facebook::velox::exec::HashTable::parallelJoinBuild", + std::function( + [&](void*) { parallelBuildTriggered = true; })); - // Free up the extra memory allocations. - for (auto& allocation : extraAllocations) { - allocation.free(); - } - extraAllocations.clear(); - }); + // TODO: add driver context to test if the memory allocation is triggered in + // driver context or not. 
- std::shared_ptr fakeMemoryTask; - std::thread memThread([&]() { - fakeMemoryTask = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - joinThread.join(); - memThread.join(); - fakeMemoryTask.reset(); + auto planNodeIdGenerator = std::make_shared(); + AssertQueryBuilder(duckDbQueryRunner_) + // Set very low table size threshold to trigger parallel build. + .config( + core::QueryConfig::kMinTableRowsForParallelJoinBuild, + std::to_string(0)) + // Set multiple hash build drivers to trigger parallel build. + .maxDrivers(4) + .queryCtx(joinQueryCtx) + .plan(PlanBuilder(planNodeIdGenerator) + .values(vectors, true) + .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) + .hashJoin( + {"t0", "t1"}, + {"u1", "u0"}, + PlanBuilder(planNodeIdGenerator) + .values(vectors, true) + .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) + .planNode(), + "", + {"t1"}, + core::JoinType::kInner) + .planNode()) + .assertResults( + "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); + ASSERT_TRUE(parallelBuildTriggered); Task::testingWaitForAllTasksToBeDeleted(); } diff --git a/velox/connectors/Connector.h b/velox/connectors/Connector.h index e11d5f600892..b8fa93f191dc 100644 --- a/velox/connectors/Connector.h +++ b/velox/connectors/Connector.h @@ -17,6 +17,7 @@ #include "velox/common/base/AsyncSource.h" #include "velox/common/base/RuntimeMetrics.h" +#include "velox/common/caching/AsyncDataCache.h" #include "velox/common/caching/ScanTracker.h" #include "velox/common/future/VeloxPromise.h" #include "velox/core/ExpressionEvaluator.h" @@ -223,7 +224,7 @@ class ConnectorQueryCtx { memory::MemoryPool* connectorPool, const Config* connectorConfig, std::unique_ptr expressionEvaluator, - memory::MemoryAllocator* FOLLY_NONNULL allocator, + cache::AsyncDataCache* cache, const std::string& queryId, const std::string& taskId, const std::string& planNodeId, @@ -232,7 +233,7 @@ class ConnectorQueryCtx { connectorPool_(connectorPool), config_(connectorConfig), expressionEvaluator_(std::move(expressionEvaluator)), - allocator_(allocator), + cache_(cache), scanId_(fmt::format("{}.{}", taskId, planNodeId)), queryId_(queryId), taskId_(taskId), @@ -260,10 +261,8 @@ class ConnectorQueryCtx { return expressionEvaluator_.get(); } - // MemoryAllocator for large allocations. Used for caching with - // CachedBufferedImput if this implements cache::AsyncDataCache. - memory::MemoryAllocator* FOLLY_NONNULL allocator() const { - return allocator_; + cache::AsyncDataCache* cache() const { + return cache_; } // This is a combination of task id and the scan's PlanNodeId. This is an id @@ -295,7 +294,7 @@ class ConnectorQueryCtx { memory::MemoryPool* connectorPool_; const Config* FOLLY_NONNULL config_; std::unique_ptr expressionEvaluator_; - memory::MemoryAllocator* FOLLY_NONNULL allocator_; + cache::AsyncDataCache* cache_; const std::string scanId_; const std::string queryId_; const std::string taskId_; diff --git a/velox/connectors/hive/HiveConfig.cpp b/velox/connectors/hive/HiveConfig.cpp index 534f7887371a..cc7d1aa6564f 100644 --- a/velox/connectors/hive/HiveConfig.cpp +++ b/velox/connectors/hive/HiveConfig.cpp @@ -155,4 +155,9 @@ int32_t HiveConfig::maxCoalescedDistanceBytes(const Config* config) { return config->get(kMaxCoalescedDistanceBytes, 512 << 10); } +// static. 
+int32_t HiveConfig::numCacheFileHandles(const Config* config) { + return config->get(kNumCacheFileHandles, 20'000); +} + } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConfig.h b/velox/connectors/hive/HiveConfig.h index 131ce587d36d..b792bdbec8eb 100644 --- a/velox/connectors/hive/HiveConfig.h +++ b/velox/connectors/hive/HiveConfig.h @@ -100,6 +100,9 @@ class HiveConfig { static constexpr const char* kMaxCoalescedDistanceBytes = "max-coalesced-distance-bytes"; + /// Maximum number of entries in the file handle cache. + static constexpr const char* kNumCacheFileHandles = "num_cached_file_handles"; + static InsertExistingPartitionsBehavior insertExistingPartitionsBehavior( const Config* config); @@ -136,6 +139,8 @@ class HiveConfig { static int64_t maxCoalescedBytes(const Config* config); static int32_t maxCoalescedDistanceBytes(const Config* config); + + static int32_t numCacheFileHandles(const Config* config); }; } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 4b245722ad85..1287ba2d3826 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -43,13 +43,12 @@ using namespace facebook::velox::exec; using namespace facebook::velox::dwrf; -DEFINE_int32( - num_file_handle_cache, - 20'000, - "Max number of file handles to cache."); - namespace facebook::velox::connector::hive { +int32_t numCachedFileHandles(const Config* properties) { + return properties ? HiveConfig::numCacheFileHandles(properties) : 20'000; +} + HiveConnector::HiveConnector( const std::string& id, std::shared_ptr properties, @@ -58,10 +57,13 @@ HiveConnector::HiveConnector( fileHandleFactory_( std::make_unique< SimpleLRUCache>>( - FLAGS_num_file_handle_cache), - std::make_unique(std::move(properties))), - executor_(executor) {} - + numCachedFileHandles(properties.get())), + std::make_unique(properties)), + executor_(executor) { + LOG(INFO) << "Hive connector " << connectorId() << " created with maximum of " + << numCachedFileHandles(properties.get()) + << " cached file handles."; +} std::unique_ptr HivePartitionFunctionSpec::create( int numPartitions) const { std::vector bucketToPartitions; diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h index 5970b9df80c1..795c0f9845ff 100644 --- a/velox/connectors/hive/HiveConnector.h +++ b/velox/connectors/hive/HiveConnector.h @@ -46,16 +46,17 @@ class HiveConnector : public Connector { HiveConfig::maxCoalescedBytes(connectorQueryCtx->config())); options.setMaxCoalesceDistance( HiveConfig::maxCoalescedDistanceBytes(connectorQueryCtx->config())); + options.setFileColumnNamesReadAsLowerCase( + HiveConfig::isFileColumnNamesReadAsLowerCase( + connectorQueryCtx->config())); return std::make_unique( outputType, tableHandle, columnHandles, &fileHandleFactory_, connectorQueryCtx->expressionEvaluator(), - connectorQueryCtx->allocator(), + connectorQueryCtx->cache(), connectorQueryCtx->scanId(), - HiveConfig::isFileColumnNamesReadAsLowerCase( - connectorQueryCtx->config()), executor_, options); } diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index 74b9f56f42d7..46de1360b58d 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -63,25 +63,39 @@ bool applyPartitionFilter( } } +struct SubfieldSpec { + const common::Subfield* subfield; + bool filterOnly; +}; + +template +void 
deduplicate(std::vector& values) { + std::sort(values.begin(), values.end()); + values.erase(std::unique(values.begin(), values.end()), values.end()); +} + // Recursively add subfields to scan spec. void addSubfields( const Type& type, - const std::vector& subfields, + std::vector& subfields, int level, memory::MemoryPool* pool, common::ScanSpec& spec) { - for (auto& subfield : subfields) { - if (level == subfield->path().size()) { + int newSize = 0; + for (int i = 0; i < subfields.size(); ++i) { + if (level < subfields[i].subfield->path().size()) { + subfields[newSize++] = subfields[i]; + } else if (!subfields[i].filterOnly) { spec.addAllChildFields(type); return; } } + subfields.resize(newSize); switch (type.kind()) { case TypeKind::ROW: { - folly::F14FastMap> - required; + folly::F14FastMap> required; for (auto& subfield : subfields) { - auto* element = subfield->path()[level].get(); + auto* element = subfield.subfield->path()[level].get(); auto* nestedField = dynamic_cast(element); VELOX_CHECK( @@ -114,11 +128,14 @@ void addSubfields( level + 1, pool, *spec.addMapValueField()); + if (subfields.empty()) { + return; + } bool stringKey = keyType->isVarchar() || keyType->isVarbinary(); std::vector stringSubscripts; std::vector longSubscripts; for (auto& subfield : subfields) { - auto* element = subfield->path()[level].get(); + auto* element = subfield.subfield->path()[level].get(); if (dynamic_cast(element)) { return; } @@ -142,8 +159,10 @@ void addSubfields( } std::unique_ptr filter; if (stringKey) { + deduplicate(stringSubscripts); filter = std::make_unique(stringSubscripts, false); } else { + deduplicate(longSubscripts); filter = common::createBigintValues(longSubscripts, false); } keys->setFilter(std::move(filter)); @@ -156,10 +175,13 @@ void addSubfields( level + 1, pool, *spec.addArrayElementField()); + if (subfields.empty()) { + return; + } constexpr long kMaxIndex = std::numeric_limits::max(); long maxIndex = -1; for (auto& subfield : subfields) { - auto* element = subfield->path()[level].get(); + auto* element = subfield.subfield->path()[level].get(); if (dynamic_cast(element)) { return; } @@ -175,7 +197,7 @@ void addSubfields( break; } default: - VELOX_FAIL("Subfields pruning not supported on type {}", type.toString()); + break; } } @@ -292,6 +314,9 @@ void checkColumnNameLowerCase(const SubfieldFilters& filters) { } void checkColumnNameLowerCase(const core::TypedExprPtr& typeExpr) { + if (typeExpr == nullptr) { + return; + } checkColumnNameLowerCase(typeExpr->type()); for (auto& type : typeExpr->inputs()) { checkColumnNameLowerCase(type); @@ -358,9 +383,8 @@ HiveDataSource::HiveDataSource( std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, core::ExpressionEvaluator* expressionEvaluator, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, const std::string& scanId, - bool fileColumnNamesReadAsLowerCase, folly::Executor* executor, const dwio::common::ReaderOptions& options) : fileHandleFactory_(fileHandleFactory), @@ -368,7 +392,7 @@ HiveDataSource::HiveDataSource( pool_(&options.getMemoryPool()), outputType_(outputType), expressionEvaluator_(expressionEvaluator), - allocator_(allocator), + cache_(cache), scanId_(scanId), executor_(executor) { // Column handled keyed on the column alias, the name used in the query. 
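The deduplicate() helper added above is the standard sort-then-unique idiom; a minimal standalone illustration (the values are hypothetical, not part of this change):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      // Duplicate subscripts arise from required subfields such as c0[10][1]
      // and c0[10][2], which both contribute the map key 10; deduplicate
      // before building the IN-list filter.
      std::vector<int64_t> subscripts{10, 1, 10};
      std::sort(subscripts.begin(), subscripts.end());
      subscripts.erase(
          std::unique(subscripts.begin(), subscripts.end()), subscripts.end());
      assert((subscripts == std::vector<int64_t>{1, 10}));
      return 0;
    }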
@@ -405,7 +429,7 @@ HiveDataSource::HiveDataSource( VELOX_CHECK( hiveTableHandle != nullptr, "TableHandle must be an instance of HiveTableHandle"); - if (fileColumnNamesReadAsLowerCase) { + if (readerOpts_.isFileColumnNamesReadAsLowerCase()) { checkColumnNameLowerCase(outputType); checkColumnNameLowerCase(hiveTableHandle->subfieldFilters()); checkColumnNameLowerCase(hiveTableHandle->remainingFilter()); @@ -796,22 +820,22 @@ std::shared_ptr HiveDataSource::makeScanSpec( spec->addFieldRecursively(name, *type, i); continue; } - std::vector subfieldPtrs; + std::vector subfieldSpecs; for (auto& subfield : subfields) { VELOX_CHECK_GT(subfield.path().size(), 0); auto* field = dynamic_cast( subfield.path()[0].get()); VELOX_CHECK(field); VELOX_CHECK_EQ(field->name(), name); - subfieldPtrs.push_back(&subfield); + subfieldSpecs.push_back({&subfield, false}); } if (auto it = requiredSubfieldsInFilters.find(name); it != requiredSubfieldsInFilters.end()) { for (auto* subfield : it->second) { - subfieldPtrs.push_back(subfield); + subfieldSpecs.push_back({subfield, true}); } } - addSubfields(*type, subfieldPtrs, 1, pool, *spec->addField(name, i)); + addSubfields(*type, subfieldSpecs, 1, pool, *spec->addField(name, i)); } for (auto& pair : filters) { @@ -835,12 +859,12 @@ std::unique_ptr HiveDataSource::createBufferedInput( const FileHandle& fileHandle, const dwio::common::ReaderOptions& readerOpts) { - if (auto* asyncCache = dynamic_cast(allocator_)) { + if (cache_) { return std::make_unique( fileHandle.file, dwio::common::MetricsLog::voidLog(), fileHandle.uuid.id(), - asyncCache, + cache_, Connector::getTracker(scanId_, readerOpts.loadQuantum()), fileHandle.groupId.id(), ioStats_, diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index c5db04e897c2..74e93195e68b 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -37,9 +37,8 @@ class HiveDataSource : public DataSource { std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, core::ExpressionEvaluator* expressionEvaluator, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, const std::string& scanId, - bool fileColumnNamesReadAsLowerCase, folly::Executor* executor, const dwio::common::ReaderOptions& options); @@ -162,7 +161,7 @@ class HiveDataSource : public DataSource { SelectivityVector filterRows_; exec::FilterEvalCtx filterEvalCtx_; - memory::MemoryAllocator* const allocator_; + cache::AsyncDataCache* const cache_{nullptr}; const std::string& scanId_; folly::Executor* executor_; }; diff --git a/velox/connectors/hive/tests/HiveConnectorTest.cpp b/velox/connectors/hive/tests/HiveConnectorTest.cpp index a8899dc465cc..fdbd12b994d2 100644 --- a/velox/connectors/hive/tests/HiveConnectorTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorTest.cpp @@ -184,11 +184,18 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) { } TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) { - auto columnType = ROW({{"c0c0", BIGINT()}, {"c0c1", VARCHAR()}}); + auto columnType = ROW( + {{"c0c0", BIGINT()}, + {"c0c1", VARCHAR()}, + {"c0c2", ROW({{"c0c2c0", BIGINT()}})}, + {"c0c3", ROW({{"c0c3c0", BIGINT()}})}}); auto rowType = ROW({{"c0", columnType}}); - auto columnHandle = makeColumnHandle("c0", columnType, {"c0.c0c1"}); + auto columnHandle = + makeColumnHandle("c0", columnType, {"c0.c0c1", "c0.c0c3"}); SubfieldFilters filters; filters.emplace(Subfield("c0.c0c0"), exec::equal(42)); + 
filters.emplace(Subfield("c0.c0c2"), exec::isNotNull()); + filters.emplace(Subfield("c0.c0c3"), exec::isNotNull()); auto scanSpec = HiveDataSource::makeScanSpec( filters, rowType, {columnHandle.get()}, {}, pool_.get()); auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0"); @@ -197,6 +204,34 @@ TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) { auto* c0c1 = scanSpec->childByName("c0")->childByName("c0c1"); ASSERT_FALSE(c0c1->isConstant()); ASSERT_FALSE(c0c1->filter()); + auto* c0c2 = scanSpec->childByName("c0")->childByName("c0c2"); + ASSERT_FALSE(c0c2->isConstant()); + ASSERT_TRUE(c0c2->filter()); + ASSERT_TRUE(c0c2->childByName("c0c2c0")->isConstant()); + auto* c0c3 = scanSpec->childByName("c0")->childByName("c0c3"); + ASSERT_FALSE(c0c3->isConstant()); + ASSERT_TRUE(c0c3->filter()); + ASSERT_FALSE(c0c3->childByName("c0c3c0")->isConstant()); +} + +TEST_F(HiveConnectorTest, makeScanSpec_duplicateSubfields) { + auto c0Type = MAP(BIGINT(), MAP(BIGINT(), BIGINT())); + auto c1Type = MAP(VARCHAR(), MAP(BIGINT(), BIGINT())); + auto rowType = ROW({{"c0", c0Type}, {"c1", c1Type}}); + std::shared_ptr columnHandles[] = { + makeColumnHandle("c0", c0Type, {"c0[10][1]", "c0[10][2]"}), + makeColumnHandle("c1", c1Type, {"c1[\"foo\"][1]", "c1[\"foo\"][2]"}), + }; + auto scanSpec = HiveDataSource::makeScanSpec( + {}, + rowType, + {columnHandles[0].get(), columnHandles[1].get()}, + {}, + pool_.get()); + auto* c0 = scanSpec->childByName("c0"); + ASSERT_EQ(c0->children().size(), 2); + auto* c1 = scanSpec->childByName("c1"); + ASSERT_EQ(c1->children().size(), 2); } TEST_F(HiveConnectorTest, extractFiltersFromRemainingFilter) { diff --git a/velox/core/CMakeLists.txt b/velox/core/CMakeLists.txt index 86d171485ed9..f4db34c8f382 100644 --- a/velox/core/CMakeLists.txt +++ b/velox/core/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(velox_core Expressions.cpp PlanFragment.cpp PlanNode.cpp target_link_libraries( velox_core + velox_caching velox_config velox_expression_functions velox_type diff --git a/velox/core/PlanNode.cpp b/velox/core/PlanNode.cpp index b9c336c43fab..41ab55952cfb 100644 --- a/velox/core/PlanNode.cpp +++ b/velox/core/PlanNode.cpp @@ -1346,19 +1346,29 @@ RowTypePtr getRowNumberOutputType( return ROW(std::move(names), std::move(types)); } + +RowTypePtr getOptionalRowNumberOutputType( + const RowTypePtr& inputType, + const std::optional& rowNumberColumnName) { + if (rowNumberColumnName) { + return getRowNumberOutputType(inputType, rowNumberColumnName.value()); + } + + return inputType; +} } // namespace RowNumberNode::RowNumberNode( PlanNodeId id, std::vector partitionKeys, - const std::string& rowNumberColumnName, + const std::optional& rowNumberColumnName, std::optional limit, PlanNodePtr source) : PlanNode(std::move(id)), partitionKeys_{std::move(partitionKeys)}, limit_{limit}, sources_{std::move(source)}, - outputType_(getRowNumberOutputType( + outputType_(getOptionalRowNumberOutputType( sources_[0]->outputType(), rowNumberColumnName)) {} @@ -1380,7 +1390,9 @@ void RowNumberNode::addDetails(std::stringstream& stream) const { folly::dynamic RowNumberNode::serialize() const { auto obj = PlanNode::serialize(); obj["partitionKeys"] = ISerializable::serialize(partitionKeys_); - obj["rowNumberColumnName"] = outputType_->names().back(); + if (generateRowNumber()) { + obj["rowNumberColumnName"] = outputType_->names().back(); + } if (limit_) { obj["limit"] = limit_.value(); } @@ -1398,26 +1410,19 @@ PlanNodePtr RowNumberNode::create(const folly::dynamic& obj, void* context) { 
limit = obj["limit"].asInt(); } + std::optional rowNumberColumnName; + if (obj.count("rowNumberColumnName")) { + rowNumberColumnName = obj["rowNumberColumnName"].asString(); + } + return std::make_shared( deserializePlanNodeId(obj), partitionKeys, - obj["rowNumberColumnName"].asString(), + rowNumberColumnName, limit, source); } -namespace { -RowTypePtr getTopNRowNumberOutputType( - const RowTypePtr& inputType, - const std::optional& rowNumberColumnName) { - if (rowNumberColumnName) { - return getRowNumberOutputType(inputType, rowNumberColumnName.value()); - } - - return inputType; -} -} // namespace - TopNRowNumberNode::TopNRowNumberNode( PlanNodeId id, std::vector partitionKeys, @@ -1432,7 +1437,7 @@ TopNRowNumberNode::TopNRowNumberNode( sortingOrders_{std::move(sortingOrders)}, limit_{limit}, sources_{std::move(source)}, - outputType_{getTopNRowNumberOutputType( + outputType_{getOptionalRowNumberOutputType( sources_[0]->outputType(), rowNumberColumnName)} { VELOX_USER_CHECK_EQ( @@ -1755,9 +1760,9 @@ PlanNodePtr PartitionedOutputNode::create( void* context) { return std::make_shared( deserializePlanNodeId(obj), + stringToKind(obj["kind"].asString()), ISerializable::deserialize>(obj["keys"], context), obj["numPartitions"].asInt(), - stringToKind(obj["kind"].asString()), obj["replicateNullsAndAny"].asBool(), ISerializable::deserialize( obj["partitionFunctionSpec"], context), diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 6e8d4dfdc5c4..4d3fedf2c469 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -1097,14 +1097,13 @@ class PartitionedOutputNode : public PlanNode { PlanNodePtr source) : PartitionedOutputNode( id, + broadcast ? Kind::kBroadcast : Kind::kPartitioned, keys, numPartitions, - broadcast ? Kind::kBroadcast : Kind::kPartitioned, replicateNullsAndAny, partitionFunctionSpec, outputType, source) {} -#endif PartitionedOutputNode( const PlanNodeId& id, @@ -1115,11 +1114,31 @@ class PartitionedOutputNode : public PlanNode { PartitionFunctionSpecPtr partitionFunctionSpec, RowTypePtr outputType, PlanNodePtr source) + : PartitionedOutputNode( + id, + kind, + keys, + numPartitions, + replicateNullsAndAny, + std::move(partitionFunctionSpec), + std::move(outputType), + std::move(source)) {} +#endif + + PartitionedOutputNode( + const PlanNodeId& id, + Kind kind, + const std::vector& keys, + int numPartitions, + bool replicateNullsAndAny, + PartitionFunctionSpecPtr partitionFunctionSpec, + RowTypePtr outputType, + PlanNodePtr source) : PlanNode(id), + kind_(kind), sources_{{std::move(source)}}, keys_(keys), numPartitions_(numPartitions), - kind_(kind), replicateNullsAndAny_(replicateNullsAndAny), partitionFunctionSpec_(std::move(partitionFunctionSpec)), outputType_(std::move(outputType)) { @@ -1129,10 +1148,11 @@ class PartitionedOutputNode : public PlanNode { keys_.empty(), "Non-empty partitioning keys require more than one partition"); } - if (isBroadcast()) { + if (!isPartitioned()) { VELOX_CHECK( keys_.empty(), - "Broadcast partitioning doesn't allow for partitioning keys"); + "{} partitioning doesn't allow for partitioning keys", + kindString(kind_)); } } @@ -1144,9 +1164,9 @@ class PartitionedOutputNode : public PlanNode { std::vector noKeys; return std::make_shared( id, + Kind::kBroadcast, noKeys, numPartitions, - Kind::kBroadcast, false, std::make_shared(), std::move(outputType), @@ -1154,13 +1174,27 @@ class PartitionedOutputNode : public PlanNode { } static std::shared_ptr - single(const PlanNodeId& id, RowTypePtr outputType, PlanNodePtr source) { + 
arbitrary(const PlanNodeId& id, RowTypePtr outputType, PlanNodePtr source) { std::vector noKeys; return std::make_shared( id, + Kind::kArbitrary, noKeys, 1, + false, + std::make_shared(), + std::move(outputType), + std::move(source)); + } + + static std::shared_ptr + single(const PlanNodeId& id, RowTypePtr outputType, PlanNodePtr source) { + std::vector noKeys; + return std::make_shared( + id, Kind::kPartitioned, + noKeys, + 1, false, std::make_shared(), std::move(outputType), @@ -1187,10 +1221,18 @@ class PartitionedOutputNode : public PlanNode { return numPartitions_; } + bool isPartitioned() const { + return kind_ == Kind::kPartitioned; + } + bool isBroadcast() const { return kind_ == Kind::kBroadcast; } + bool isArbitrary() const { + return kind_ == Kind::kArbitrary; + } + Kind kind() const { return kind_; } @@ -1222,10 +1264,10 @@ class PartitionedOutputNode : public PlanNode { private: void addDetails(std::stringstream& stream) const override; + const Kind kind_; const std::vector sources_; const std::vector keys_; const int numPartitions_; - const Kind kind_; const bool replicateNullsAndAny_; const PartitionFunctionSpecPtr partitionFunctionSpec_; const RowTypePtr outputType_; @@ -2066,19 +2108,21 @@ class WindowNode : public PlanNode { /// Optimized version of a WindowNode for a single row_number function with an /// optional limit and no sorting. -/// The output of this node contains all input columns followed by a +/// The output of this node contains all input columns followed by an optional /// 'rowNumberColumnName' BIGINT column. class RowNumberNode : public PlanNode { public: /// @param partitionKeys Partitioning keys. May be empty. - /// @param rowNumberColumnName Name of the column containing row numbers. + /// @param rowNumberColumnName Optional name of the column containing row + /// numbers. If not specified, the output doesn't include 'row number' column. + /// This is used when computing partial results. /// @param limit Optional per-partition limit. If specified, the number of /// rows produced by this node will not exceed this value for any given /// partition. Extra rows will be dropped. 
RowNumberNode( PlanNodeId id, std::vector partitionKeys, - const std::string& rowNumberColumnName, + const std::optional& rowNumberColumnName, std::optional limit, PlanNodePtr source); @@ -2098,6 +2142,10 @@ class RowNumberNode : public PlanNode { return limit_; } + bool generateRowNumber() const { + return outputType_->size() > sources_[0]->outputType()->size(); + } + std::string_view name() const override { return "RowNumber"; } diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index cb06600b2fb5..2bfa52f6d9f1 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -21,13 +21,13 @@ QueryCtx::QueryCtx( folly::Executor* executor, std::unordered_map queryConfigValues, std::unordered_map> connectorConfigs, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, std::shared_ptr pool, std::shared_ptr spillExecutor, const std::string& queryId) : queryId_(queryId), connectorConfigs_(connectorConfigs), - allocator_(allocator), + cache_(cache), pool_(std::move(pool)), executor_(executor), queryConfig_{std::move(queryConfigValues)}, @@ -39,12 +39,12 @@ QueryCtx::QueryCtx( folly::Executor::KeepAlive<> executorKeepalive, std::unordered_map queryConfigValues, std::unordered_map> connectorConfigs, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, std::shared_ptr pool, const std::string& queryId) : queryId_(queryId), connectorConfigs_(connectorConfigs), - allocator_(allocator), + cache_(cache), pool_(std::move(pool)), executorKeepalive_(std::move(executorKeepalive)), queryConfig_{std::move(queryConfigValues)} { diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index d4f50874a998..7b0f178b5c75 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -17,8 +17,8 @@ #include #include +#include "velox/common/caching/AsyncDataCache.h" #include "velox/common/memory/Memory.h" -#include "velox/common/memory/MemoryAllocator.h" #include "velox/core/QueryConfig.h" #include "velox/vector/DecodedVector.h" #include "velox/vector/VectorPool.h" @@ -39,8 +39,7 @@ class QueryCtx { std::unordered_map queryConfigValues = {}, std::unordered_map> connectorConfigs = {}, - memory::MemoryAllocator* allocator = - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), std::shared_ptr pool = nullptr, std::shared_ptr spillExecutor = nullptr, const std::string& queryId = ""); @@ -54,8 +53,7 @@ class QueryCtx { std::unordered_map queryConfigValues = {}, std::unordered_map> connectorConfigs = {}, - memory::MemoryAllocator* allocator = - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), std::shared_ptr pool = nullptr, const std::string& queryId = ""); @@ -65,8 +63,8 @@ class QueryCtx { return pool_.get(); } - memory::MemoryAllocator* allocator() const { - return allocator_; + cache::AsyncDataCache* cache() const { + return cache_; } folly::Executor* executor() const { @@ -135,7 +133,7 @@ class QueryCtx { const std::string queryId_; std::unordered_map> connectorConfigs_; - memory::MemoryAllocator* allocator_; + cache::AsyncDataCache* cache_; std::shared_ptr pool_; folly::Executor* executor_; folly::Executor::KeepAlive<> executorKeepalive_; diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index bbb595e0d185..0d1fa0c93d32 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -286,7 +286,7 @@ Hive Connector - bool - false - True if reading the source file column names as lower case, and planner should guarantee - - the 
input column name and filter is also lower case to achive case-insensitive read..
+      the input column name and filter is also lower case to achieve case-insensitive read.
   * - max-coalesced-bytes
     - integer
     - 512KB
@@ -327,7 +327,7 @@ Hive Connector
     - bool
     - false
     - Use path-style access for all requests to the S3-compatible storage. This is for S3-compatible storage that
-      doesn’t support virtual-hosted-style access.
+      doesn't support virtual-hosted-style access.
   * - hive.s3.ssl.enabled
     - bool
     - true
diff --git a/velox/docs/develop.rst b/velox/docs/develop.rst
index 4f132d68a914..35bac5e81730 100644
--- a/velox/docs/develop.rst
+++ b/velox/docs/develop.rst
@@ -22,7 +22,7 @@ This guide is intended for Velox contributors and developers of Velox-based appl
    develop/task
    develop/simd
    develop/spilling
-   develop/unsaferow
+   develop/serde
    develop/testing
    develop/debugging
    develop/TpchBenchmark
diff --git a/velox/docs/develop/aggregate-functions.rst b/velox/docs/develop/aggregate-functions.rst
index 234d230704c8..2ec4bc4be95e 100644
--- a/velox/docs/develop/aggregate-functions.rst
+++ b/velox/docs/develop/aggregate-functions.rst
@@ -583,27 +583,293 @@ You can see the documentation for all functions at :doc:`../functions/presto/agg
 Accumulator
 -----------
 
-Variable-width accumulators need to use :doc:`HashStringAllocator ` to allocate memory. An instance of the allocator is available in the base class: *velox::exec::Aggregate::allocator_*.
+In Velox, efficient use of memory is a priority. This includes both optimizing
+the total amount of memory used as well as the number of memory allocations.
+Note that runtime statistics reported by Velox include both peak memory usage
+(in bytes) and number of memory allocations for each operator.
 
-Sometimes you’ll need to create a custom accumulator. Sometimes one of the existing accumulators would do the jobs.
+Aggregate functions use memory to store intermediate results in the
+accumulators. They allocate memory from an arena (:doc:`HashStringAllocator ` class).
 
-SingleValueAccumulator used by :func:`min`, :func:`max` and :func:`arbitrary` functions can be used to store a single value of variable-width type, e.g. string, array, map or struct.
+array_agg and ValueList
+~~~~~~~~~~~~~~~~~~~~~~~
 
-ValueList accumulator used by :func:`array_agg` and :func:`map_agg` accumulates a list of values. This is an append-only accumulator.
+StlAllocator is an STL-compatible allocator backed by HashStringAllocator that
+can be used with STL containers. For example, one can define an std::vector
+that allocates memory from the arena like so:
 
-An StlAllocator defined in velox/exec/HashStringAllocator.h can be used to make STL containers (e.g. std::vector) backed by memory allocated via the HashStringAllocator. StlAllocator is not an accumulator itself, but can be used to design accumulators that use STL containers. It is used by :func:`approx_percentile` and :func:`approx_distinct`.
+.. code-block:: c++
+
+    std::vector<int64_t, StlAllocator<int64_t>>
+
+This is used, for example, in 3-arg versions of :func:`min_by` and :func:`max_by` with
+fixed-width type inputs (e.g. integers).
+
+There is also an AlignedStlAllocator that provides aligned allocations from the
+arena and can be used with `F14 `_
+containers which require 16-byte alignment. One can define an F14FastMap that
+allocates memory from the arena like so:
 
-Memory allocated from the HashStringAllocator needs to be released in the destroy() method. See velox/aggregates/ArrayAgg.cpp for an example.
-
-.. code-block:: c++
-
-   void destroy(folly::Range<char**> groups) override {
-     for (auto group : groups) {
-       if (auto header = value<ArrayAccumulator>(group)->elements.begin()) {
-         allocator_->free(header);
-       }
-     }
-   }
+.. code-block:: c++
+
+    folly::F14FastMap<
+        int64_t,
+        double,
+        std::hash<int64_t>,
+        std::equal_to<int64_t>,
+        AlignedStlAllocator<std::pair<const int64_t, double>, 16>>
+
+You can find an example usage in the :func:`histogram` aggregation function.
+
+An :func:`array_agg` function on primitive types could be implemented using
+std::vector, but it would not be efficient. Why is that? If one doesn’t use
+the ‘reserve’ method to provide a hint to std::vector about how many entries
+will be added, the default behavior is to allocate memory in powers of 2, e.g.
+first allocate 1 entry, then 2, then 4, 8, 16, etc. Every time a new
+allocation is made, the data is copied into the new memory buffer and the old
+buffer is released. One can see this by instrumenting the
+StlAllocator::allocate and deallocate methods to add logging and running a
+simple loop that adds elements to a vector:
+
+.. code-block:: c++
+
+    std::vector<int64_t, StlAllocator<int64_t>> data(
+        0, StlAllocator<int64_t>(allocator_.get()));
+
+    for (auto i = 0; i < 100; ++i) {
+      data.push_back(i);
+    }
+
+.. code-block:: text
+
+    E20230714 14:57:33.717708 975289 HashStringAllocator.h:497] allocate 1
+    E20230714 14:57:33.734280 975289 HashStringAllocator.h:497] allocate 2
+    E20230714 14:57:33.734321 975289 HashStringAllocator.h:506] free 1
+    E20230714 14:57:33.734352 975289 HashStringAllocator.h:497] allocate 4
+    E20230714 14:57:33.734381 975289 HashStringAllocator.h:506] free 2
+    E20230714 14:57:33.734416 975289 HashStringAllocator.h:497] allocate 8
+    E20230714 14:57:33.734445 975289 HashStringAllocator.h:506] free 4
+    E20230714 14:57:33.734481 975289 HashStringAllocator.h:497] allocate 16
+    E20230714 14:57:33.734513 975289 HashStringAllocator.h:506] free 8
+    E20230714 14:57:33.734544 975289 HashStringAllocator.h:497] allocate 32
+    E20230714 14:57:33.734575 975289 HashStringAllocator.h:506] free 16
+    E20230714 14:57:33.734606 975289 HashStringAllocator.h:497] allocate 64
+    E20230714 14:57:33.734637 975289 HashStringAllocator.h:506] free 32
+    E20230714 14:57:33.734668 975289 HashStringAllocator.h:497] allocate 128
+    E20230714 14:57:33.734699 975289 HashStringAllocator.h:506] free 64
+    E20230714 14:57:33.734731 975289 HashStringAllocator.h:506] free 128
+
+Reallocating memory and copying data is not cheap. To avoid this overhead we
+introduced the ValueList primitive and used it to implement array_agg.
+
+ValueList is an append-only data structure that allows appending values from any
+Velox Vector and reading values back into a Velox Vector. ValueList doesn’t
+require a contiguous chunk of memory and therefore doesn’t need to re-allocate
+and copy when it runs out of space. It just allocates another chunk and starts
+filling that up.
+
+ValueList is designed to work with data that comes from Velox Vectors, hence,
+its API is different from std::vector. You append values from a DecodedVector
+and read values back into a flat vector. Here is an example of usage:
+
+.. code-block:: c++
+
+    DecodedVector decoded(*data);
+
+    // Store data.
+    ValueList values;
+    for (auto i = 0; i < 100; ++i) {
+      values.appendValue(decoded, i, allocator());
+    }
+
+    // Read data back.
+    auto copy = BaseVector::create(DOUBLE(), 100, pool());
+    aggregate::ValueListReader reader(values);
+    for (auto i = 0; i < 100; ++i) {
+      reader.next(*copy, i);
+    }
+
+ValueList supports all types, so you can use it to append fixed-width values as
+well as strings, arrays, maps and structs.
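+
+As an aside on the std::vector log above: when the number of entries is known
+up front, std::vector's reserve makes a single arena allocation and avoids the
+grow-copy-free sequence entirely. A minimal sketch, assuming the same
+StlAllocator setup as before:
+
+.. code-block:: c++
+
+    std::vector<int64_t, StlAllocator<int64_t>> data(
+        0, StlAllocator<int64_t>(allocator_.get()));
+
+    // One allocation sized for all 100 entries; no intermediate copies.
+    data.reserve(100);
+    for (auto i = 0; i < 100; ++i) {
+      data.push_back(i);
+    }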
+
+When storing complex types, ValueList serializes the values using
+ContainerRowSerde.
+
+ValueList preserves the null flags as well, so you can store a list of nullable
+values in it.
+
+The array_agg function is implemented using ValueList for the accumulator.
+
+ValueList needs a pointer to the arena for appending data. It doesn’t take an
+arena in the constructor and doesn’t store it, because that would require 8
+bytes of memory per group in the aggregation operator. Instead, the
+ValueList::appendValue method takes a pointer to the arena as an argument.
+Consequently, ValueList’s destructor cannot release the memory back to the
+arena and requires the user to explicitly call the
+free(HashStringAllocator*) method.
+
+min, max, and SingleValueAccumulator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :func:`min` and :func:`max` functions store a single value in the
+accumulator (the current min or max value). They use SingleValueAccumulator to
+store strings, arrays, maps and structs. When processing a new value, we
+compare it with the stored value and replace the stored value if necessary.
+
+Similar to ValueList, SingleValueAccumulator serializes the values using
+ContainerRowSerde. SingleValueAccumulator provides a compare method to compare
+the stored value with a row of a DecodedVector.
+
+This accumulator is also used in the implementation of the :func:`arbitrary`
+aggregate function, which stores the first value in the accumulator.
+
+set_agg, set_union, Strings and AddressableNonNullValueList
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :func:`set_agg` function accumulates a set of unique values into an
+F14FastSet configured to allocate memory from the arena via
+AlignedStlAllocator. Fixed-width values are stored directly in the F14FastSet.
+The memory allocation pattern for F14 data structures is similar to
+std::vector: F14 allocates memory in powers of 2, copies data and frees
+previously allocated memory. Hence, we do not store strings directly in the
+F14 set. Instead, Velox writes strings into the arena and stores a StringView
+pointing to the arena in the set.
+
+In general, when writing to the arena, one is not guaranteed a contiguous write.
+However, for StringViews to work we must ensure that strings written into the
+arena are contiguous. The Strings helper class provides this functionality. Its
+append method takes a StringView and a pointer to the arena, copies the string
+into the arena and returns a StringView pointing to the copy.
+
+.. code-block:: c++
+
+    /// Copies the string into contiguous memory allocated via
+    /// HashStringAllocator. Returns StringView over the copy.
+    StringView append(StringView value, HashStringAllocator& allocator);
+
+The Strings class provides a free method to release memory back to the arena.
+
+.. code-block:: c++
+
+    /// Frees memory used by the strings. StringViews returned from 'append'
+    /// become invalid after this call.
+    void free(HashStringAllocator& allocator);
+
+When aggregating complex types (arrays, maps or structs), we use
+AddressableNonNullValueList, which writes values to the arena and returns
+a “pointer” to the written value, which we store in the F14 set.
+AddressableNonNullValueList provides methods to compute a hash of a value and
+compare two values. AddressableNonNullValueList uses ContainerRowSerde for
+serializing data and comparing serialized values.
+
+.. code-block:: c++
+
+    /// A set of pointers to values stored in AddressableNonNullValueList.
+    SetAccumulator<
+        HashStringAllocator::Position,
+        AddressableNonNullValueList::Hash,
+        AddressableNonNullValueList::EqualTo>
+        base;
+
+AddressableNonNullValueList allows appending a value and erasing the last
+value. This functionality is sufficient for set_agg and set_union. When
+processing a new value, we append it to the list, get a “pointer”, insert that
+“pointer” into the F14 set, and if the “pointer” points to a duplicate value we
+remove it from the list.
+
+Like all other arena-based accumulators, AddressableNonNullValueList provides a
+free method to return memory back to the arena.
+
+Note: AddressableNonNullValueList is different from ValueList in that it
+provides access to individual values (hence, the “Addressable” prefix in the
+name) while ValueList does not. With ValueList one can append values, then copy
+all the values into a Vector. Ad hoc access to individual elements is not
+available in ValueList.
+
+The SetAccumulator<T> template implements a simple interface to accumulate
+unique values. It is implemented using F14FastSet, Strings and
+AddressableNonNullValueList. T can be a fixed-width type like int32_t or
+int64_t, StringView or ComplexType.
+
+The addValue and addValues methods allow adding one or more values from a
+vector.
+
+.. code-block:: c++
+
+    /// Adds value if new. No-op if the value was added before.
+    void addValue(
+        const DecodedVector& decoded,
+        vector_size_t index,
+        HashStringAllocator* allocator)
+
+    /// Adds new values from an array.
+    void addValues(
+        const ArrayVector& arrayVector,
+        vector_size_t index,
+        const DecodedVector& values,
+        HashStringAllocator* allocator)
+
+The size() method returns the number of unique values.
+
+.. code-block:: c++
+
+    /// Returns number of unique values including null.
+    size_t size() const
+
+The extractValues method allows extracting the unique values into a vector.
+
+.. code-block:: c++
+
+    /// Copies the unique values and null into the specified vector starting at
+    /// the specified offset.
+    vector_size_t extractValues(FlatVector<T>& values, vector_size_t offset)
+
+    /// For complex types.
+    vector_size_t extractValues(BaseVector& values, vector_size_t offset)
+
+Both :func:`set_agg` and :func:`set_union` functions are implemented using
+SetAccumulator.
+
+map_agg, map_union, and MapAccumulator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :func:`map_agg` function accumulates keys and values into a map. It
+discards duplicate keys and keeps only one value for each unique key. map_agg
+uses the MapAccumulator template to accumulate the values. Similar to
+SetAccumulator, MapAccumulator is built using F14FastMap, AlignedStlAllocator,
+Strings and AddressableNonNullValueList.
+
+The insert() method adds a pair of (key, value) to the map, discarding the
+value if a matching key already exists.
+
+.. code-block:: c++
+
+    /// Adds key-value pair if entry with that key doesn't exist yet.
+    void insert(
+        const DecodedVector& decodedKeys,
+        const DecodedVector& decodedValues,
+        vector_size_t index,
+        HashStringAllocator& allocator)
+
+The size() method returns the number of unique keys.
+
+The extract() method copies the keys and the values into vectors, which can be
+combined to form a MapVector.
+
+.. code-block:: c++
+
+    void extract(
+        const VectorPtr& mapKeys,
+        const VectorPtr& mapValues,
+        vector_size_t offset)
+
+Both :func:`map_agg` and :func:`map_union` functions are implemented using
+MapAccumulator.
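+
+All of these arena-backed accumulators share the same lifecycle requirement:
+memory must be handed back to the arena explicitly in the aggregate function's
+destroy() hook. A minimal sketch for a hypothetical ValueList-based
+accumulator (the struct name is an assumption, modeled on array_agg):
+
+.. code-block:: c++
+
+    struct ArrayAccumulator {
+      ValueList elements;
+    };
+
+    void destroy(folly::Range<char**> groups) override {
+      for (auto group : groups) {
+        // ValueList's destructor cannot reach the arena; release the memory
+        // explicitly via the HashStringAllocator.
+        value<ArrayAccumulator>(group)->elements.free(allocator_);
+      }
+    }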
+ +When implementing new aggregate functions, consider using ValueList, +SingleValueAccumulator, Strings, AddressableNonNullValueList and F14 +containers to put together an accumulator that uses memory efficiently. End-to-End Testing ------------------ diff --git a/velox/docs/develop/operators.rst b/velox/docs/develop/operators.rst index 3dc7ab1649a7..18cbadab9152 100644 --- a/velox/docs/develop/operators.rst +++ b/velox/docs/develop/operators.rst @@ -89,7 +89,7 @@ with HiveConnector, table scan reads data from ORC or Parquet files. ArrowStreamNode ~~~~~~~~~~~~~~~ -The Arrow stream operation reads data from an Arrow array stream. The ArrowArrayStream structure is defined in Arrow abi, +The Arrow stream operation reads data from an Arrow array stream. The ArrowArrayStream structure is defined in Arrow abi, and provides the required callbacks to interact with a streaming source of Arrow arrays. .. list-table:: @@ -583,8 +583,8 @@ each batch of input it computes and returns the results before accepting the next batch of input. This operator accumulates state: a hash table mapping partition keys to total -number of rows seen in this partition so far. This operator doesn't support -spilling yet. +number of rows seen in this partition so far. Returning the row numbers as +a column in the output is optional. This operator doesn't support spilling yet. This operator is equivalent to a WindowNode followed by FilterNode(row_number <= limit), but it uses less memory and CPU and makes @@ -600,7 +600,7 @@ results available before seeing all input. * - partitionKeys - Partition by columns. * - rowNumberColumnName - - Output column name for the row numbers. + - Optional output column name for the row numbers. If specified, the generated row numbers are returned as an output column appearing after all input columns. * - limit - Optional per-partition limit. If specified, the number of rows produced by this node will not exceed this value for any given partition. Extra rows will be dropped. @@ -615,8 +615,8 @@ a 'limit' number of top rows for each partition. After receiving all input, assigns row numbers within each partition starting from 1. This operator accumulates state: a hash table mapping partition keys to a list -of top 'limit' rows within that partition. This operator doesn't support -spilling yet. +of top 'limit' rows within that partition. Returning the row numbers as +a column in the output is optional. This operator doesn't support spilling yet. This operator is logically equivalent to a WindowNode followed by FilterNode(row_number <= limit), but it uses less memory and CPU. @@ -635,7 +635,7 @@ FilterNode(row_number <= limit), but it uses less memory and CPU. * - sortingOrders - Sorting order for each sorting key above. The supported sort orders are asc nulls first, asc nulls last, desc nulls first and desc nulls last. * - rowNumberColumnName - - Output column name for the row numbers. + - Optional output column name for the row numbers. If specified, the generated row numbers are returned as an output column appearing after all input columns. * - limit - Per-partition limit. If specified, the number of rows produced by this node will not exceed this value for any given partition. Extra rows will be dropped. 
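+
+A minimal sketch of constructing the partial-style variant described above (no
+row-number output column), assuming the updated RowNumberNode constructor from
+this change, with partitionKeys and source as placeholders in scope:
+
+.. code-block:: c++
+
+    // Row numbers are still computed to enforce the per-partition limit, but
+    // they are not returned because rowNumberColumnName is std::nullopt.
+    auto rowNumber = std::make_shared<core::RowNumberNode>(
+        "rowNumber",                  // plan node id
+        partitionKeys,                // partition-by keys, may be empty
+        std::nullopt,                 // no row-number output column
+        std::optional<int32_t>(100),  // keep at most 100 rows per partition
+        source);                      // upstream plan node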
diff --git a/velox/docs/develop/serde.rst b/velox/docs/develop/serde.rst
new file mode 100644
index 000000000000..8b367c5baaf8
--- /dev/null
+++ b/velox/docs/develop/serde.rst
@@ -0,0 +1,29 @@
+*********************
+Serialization Formats
+*********************
+
+Velox supports three data serialization formats that can be used for data shuffle:
+`PrestoPage `_,
+UnsafeRow and CompactRow. PrestoPage is a columnar format. UnsafeRow and CompactRow
+are row-wise formats.
+
+Velox applications can register their own formats as well.
+
+PrestoPage format is described in the `Presto documentation `_.
+
+UnsafeRow format comes from `Apache Spark `_.
+
+CompactRow is similar to UnsafeRow, but it is more space efficient and results in
+fewer bytes shuffled, which has a cascading effect on CPU usage (for compression
+and checksumming) and memory (for buffering).
+
+The details of the UnsafeRow and CompactRow formats can be found in the following articles.
+
+.. toctree::
+  :maxdepth: 1
+
+  serde/unsaferow
+  serde/compactrow
+
+Velox also uses another row-wise serialization format, ContainerRowSerde, for storing
+data in aggregation and join operators. This format is similar to CompactRow.
diff --git a/velox/docs/develop/serde/compactrow.rst b/velox/docs/develop/serde/compactrow.rst
new file mode 100644
index 000000000000..bb40c909b87e
--- /dev/null
+++ b/velox/docs/develop/serde/compactrow.rst
@@ -0,0 +1,118 @@
+==========
+CompactRow
+==========
+
+CompactRow is a row-wise serialization format provided by Velox as an
+alternative to the UnsafeRow format. CompactRow is more space efficient than
+UnsafeRow and results in fewer bytes shuffled, which has a cascading effect on
+CPU usage (for compression and checksumming) and memory (for buffering).
+
+A row is a contiguous buffer that starts with null flags, followed by individual
+fields.
+
+nulls | field1 | field2 | …
+
+The nulls section uses one bit per field to indicate which fields are null. If
+there are 10 fields, there will be 2 bytes of null flags (16 bits total, 10 bits
+used, 6 bits unused).
+
+Fixed-width fields (integers, booleans, floating point numbers) take up a fixed
+number of bytes regardless of whether they are null or not. A row with 10
+bigint fields takes up 2 + 10 * 8 = 82 bytes: 2 bytes for null flags plus 8
+bytes per field.
+
+The sizes of fixed-width fields are:
+
+================ ==============================================
+Type             Number of bytes used for serialization
+================ ==============================================
+BOOLEAN          1
+TINYINT          1
+SMALLINT         2
+INTEGER          4
+BIGINT           8
+HUGEINT          16
+REAL             4
+DOUBLE           8
+TIMESTAMP        8
+UNKNOWN          0
+================ ==============================================
+
+Strings (VARCHAR and VARBINARY) use 4 bytes for the size plus the length of the
+string. An empty string uses 4 bytes. A 1-character string uses 5 bytes. A
+20-character ASCII string uses 24 bytes. Null strings do not take up space
+(other than one bit in the nulls section).
+
+Arrays of fixed-width values or strings, e.g. arrays of integers, use 4 bytes
+for the size of the array, a few bytes for null flags indicating the null-ness
+of the elements (1 bit per element), plus the space taken by the elements
+themselves.
+
+For example, an array of 5 integers [1, 2, 3, 4, 5] uses 4 bytes for the size, 1
+byte for 5 null flags and 5 * 4 bytes for 5 values. A total of 25 bytes.
+
+
+============ ==== ======== ====== ====== ====== ====== ======
+Description  Size Nulls    Elem 1 Elem 2 Elem 3 Elem 4 Elem 5
+============ ==== ======== ====== ====== ====== ====== ======
+# of bytes   4    1        4      4      4      4      4
+Value        5    00000000 1      2      3      4      5
+============ ==== ======== ====== ====== ====== ====== ======
+
+An array of 4 strings [null, “Abc”, null, “Mountains and rivers”] uses 36 bytes:
+
+============ ==== ======== ======= ====== ======= =====================
+Description  Size Nulls    Size s2 s2     Size s4 s4
+============ ==== ======== ======= ====== ======= =====================
+# of bytes   4    1        4       3      4       20
+Value        4    10100000 3       Abc    20      Mountains and rivers
+============ ==== ======== ======= ====== ======= =====================
+
+Serialization of an array of complex type elements, e.g. an array of arrays, maps or structs, includes a few additional fields: the total serialized size plus the offset of each element in the serialized buffer.
+
+- 4 bytes - array size.
+- N bytes - null flags, 1 bit per element.
+- 4 bytes - Total serialized size of the array excluding the first 2 fields (size and nulls).
+- 4 bytes per element - Offsets of the elements in the serialized buffer relative to the position right after the total serialized size.
+- Elements.
+
+For example, an array of arrays [[1, 2, 3], [4, 5], [6]] uses 60 bytes:
+
+- 4 bytes - size - 3
+- 1 byte - nulls - 00000000
+- 4 bytes - total serialized size - 55
+- 4 bytes - offset of the 1st element - 12
+- 4 bytes - offset of the 2nd element - 29
+- 4 bytes - offset of the 3rd element - 42
+- --- Start of the 1st element: [1, 2, 3]
+- 4 bytes - size - 3
+- 1 byte - nulls - 00000000
+- 4 bytes - element 1 - 1
+- 4 bytes - element 2 - 2
+- 4 bytes - element 3 - 3
+- --- Start of the 2nd element: [4, 5]
+- 4 bytes - size - 2
+- 1 byte - nulls - 00000000
+- 4 bytes - element 1 - 4
+- 4 bytes - element 2 - 5
+- --- Start of the 3rd element: [6]
+- 4 bytes - size - 1
+- 1 byte - nulls - 00000000
+- 4 bytes - element 1 - 6
+
+A map is serialized as the keys array followed by the values array.
+
+A struct is serialized the same way as the top-level row.
+
+Compared to UnsafeRow, CompactRow serialization is on average about half the
+size. Some examples:
+
+====================== ========= ==========
+Type                   UnsafeRow CompactRow
+====================== ========= ==========
+INTEGER                8         4
+BIGINT                 8         8
+REAL                   8         4
+DOUBLE                 8         8
+VARCHAR: “” (empty)    8         4
+VARCHAR: “Abc”         16        7
+====================== ========= ==========
diff --git a/velox/docs/develop/unsaferow.rst b/velox/docs/develop/serde/unsaferow.rst
similarity index 98%
rename from velox/docs/develop/unsaferow.rst
rename to velox/docs/develop/serde/unsaferow.rst
index c73b19555631..b64b44e5f0b2 100644
--- a/velox/docs/develop/unsaferow.rst
+++ b/velox/docs/develop/serde/unsaferow.rst
@@ -1,6 +1,6 @@
-==============================
-UnsafeRow Serialization Format
-==============================
+=========
+UnsafeRow
+=========
 
 Velox supports two data serialization formats out of the box:
 `PrestoPage `_
diff --git a/velox/docs/functions/presto/math.rst b/velox/docs/functions/presto/math.rst
index beea7aff0acc..6ddf0c2e3f95 100644
--- a/velox/docs/functions/presto/math.rst
+++ b/velox/docs/functions/presto/math.rst
@@ -260,6 +260,16 @@ Probability Functions: cdf
 
    Compute the Cauchy cdf with given parameters median and scale (gamma): P(N; median, scale).
    The scale parameter must be a positive double. The value parameter must be a double on the interval [0, 1].
 
+.. function:: chi_squared_cdf(df, value) -> double
+
+   Compute the Chi-square cdf with given df (degrees of freedom) parameter: P(N < value; df).
+   The df parameter must be a positive real number, and value must be a non-negative real value (both of type DOUBLE).
+
+.. function:: f_cdf(df1, df2, value) -> double
+
+   Compute the F cdf with given df1 (numerator degrees of freedom) and df2 (denominator degrees of freedom) parameters: P(N < value; df1, df2).
+   The numerator and denominator df parameters must be positive real numbers. The value must be a non-negative real number.
+
 .. function:: normal_cdf(mean, sd, value) -> double
 
    Compute the Normal cdf with given mean and standard deviation (sd): P(N < value; mean, sd).
diff --git a/velox/docs/functions/spark/string.rst b/velox/docs/functions/spark/string.rst
index f3ecc78a688f..c1f433a53ece 100644
--- a/velox/docs/functions/spark/string.rst
+++ b/velox/docs/functions/spark/string.rst
@@ -11,6 +11,8 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
 .. spark:function:: chr(n) -> varchar
 
    Returns the Unicode code point ``n`` as a single character string.
+   If ``n < 0``, the result is an empty string.
+   If ``n >= 256``, the result is equivalent to chr(``n % 256``).
 
 .. spark:function:: contains(left, right) -> boolean
diff --git a/velox/duckdb/CMakeLists.txt b/velox/duckdb/CMakeLists.txt
index dabc52b1fc97..eee20a7f868f 100644
--- a/velox/duckdb/CMakeLists.txt
+++ b/velox/duckdb/CMakeLists.txt
@@ -12,4 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 add_subdirectory(conversion)
-add_subdirectory(memory)
diff --git a/velox/duckdb/memory/Allocator.cpp b/velox/duckdb/memory/Allocator.cpp
deleted file mode 100644
index 3f473de5c012..000000000000
--- a/velox/duckdb/memory/Allocator.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "velox/duckdb/memory/Allocator.h" - -namespace facebook::velox::duckdb { - -::duckdb::data_ptr_t veloxPoolAllocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::idx_t size) { - auto veloxPrivateData = dynamic_cast(privateData); - VELOX_CHECK(veloxPrivateData); - return static_cast<::duckdb::data_ptr_t>( - veloxPrivateData->pool.allocate(size)); -} - -void veloxPoolFree( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t size) { - auto veloxPrivateData = dynamic_cast(privateData); - VELOX_CHECK(veloxPrivateData); - veloxPrivateData->pool.free(pointer, size); -} - -::duckdb::data_ptr_t veloxPoolReallocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t oldSize, - ::duckdb::idx_t size) { - auto veloxPrivateData = dynamic_cast(privateData); - VELOX_CHECK(veloxPrivateData); - return static_cast<::duckdb::data_ptr_t>( - veloxPrivateData->pool.reallocate(pointer, oldSize, size)); -} - -VeloxPoolAllocator& getDefaultAllocator() { - static std::shared_ptr pool = - memory::addDefaultLeafMemoryPool("VeloxPoolAllocator"); - static VeloxPoolAllocator allocator{*pool}; - return allocator; -} - -} // namespace facebook::velox::duckdb diff --git a/velox/duckdb/memory/Allocator.h b/velox/duckdb/memory/Allocator.h deleted file mode 100644 index 692a4ce582f1..000000000000 --- a/velox/duckdb/memory/Allocator.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "velox/common/memory/Memory.h" -#include "velox/external/duckdb/duckdb.hpp" - -namespace facebook::velox::duckdb { - -struct PrivateVeloxAllocatorData : public ::duckdb::PrivateAllocatorData { - explicit PrivateVeloxAllocatorData(memory::MemoryPool& pool_) : pool(pool_) {} - - ~PrivateVeloxAllocatorData() override {} - - memory::MemoryPool& pool; -}; - -::duckdb::data_ptr_t veloxPoolAllocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::idx_t size); - -void veloxPoolFree( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t size); - -::duckdb::data_ptr_t veloxPoolReallocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t oldSize, - ::duckdb::idx_t size); - -class VeloxPoolAllocator : public ::duckdb::Allocator { - public: - explicit VeloxPoolAllocator(memory::MemoryPool& pool) - : ::duckdb::Allocator( - veloxPoolAllocate, - veloxPoolFree, - veloxPoolReallocate, - std::make_unique(pool)) {} -}; - -VeloxPoolAllocator& getDefaultAllocator(); - -} // namespace facebook::velox::duckdb diff --git a/velox/duckdb/memory/CMakeLists.txt b/velox/duckdb/memory/CMakeLists.txt deleted file mode 100644 index e8f751418382..000000000000 --- a/velox/duckdb/memory/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -add_library(velox_duckdb_allocator Allocator.cpp) - -target_link_libraries(velox_duckdb_allocator velox_dwio_common duckdb fmt::fmt) - -if(NOT VELOX_DISABLE_GOOGLETEST) - target_link_libraries(velox_duckdb_allocator gtest) -endif() diff --git a/velox/dwio/common/CMakeLists.txt b/velox/dwio/common/CMakeLists.txt index 7f83771b3ea9..080d0f7826e2 100644 --- a/velox/dwio/common/CMakeLists.txt +++ b/velox/dwio/common/CMakeLists.txt @@ -20,8 +20,6 @@ elseif(${VELOX_BUILD_TEST_UTILS}) add_subdirectory(tests/utils) endif() -include_directories(/opt/homebrew/opt/protobuf/include) - add_library( velox_dwio_common BitConcatenation.cpp @@ -54,6 +52,8 @@ add_library( TypeWithId.cpp WriterFactory.cpp) +target_include_directories(velox_dwio_common PRIVATE ${Protobuf_INCLUDE_DIRS}) + target_link_libraries( velox_dwio_common velox_buffer diff --git a/velox/dwio/common/CachedBufferedInput.cpp b/velox/dwio/common/CachedBufferedInput.cpp index 9c20aa580491..f8fbf33f68ea 100644 --- a/velox/dwio/common/CachedBufferedInput.cpp +++ b/velox/dwio/common/CachedBufferedInput.cpp @@ -87,8 +87,9 @@ bool CachedBufferedInput::shouldPreload(int32_t numPages) { memory::AllocationTraits::kPageSize; } auto cachePages = cache_->incrementCachedPages(0); - auto maxPages = memory::AllocationTraits::numPages(cache_->capacity()); - auto allocatedPages = cache_->numAllocated(); + auto allocator = cache_->allocator(); + auto maxPages = memory::AllocationTraits::numPages(allocator->capacity()); + auto allocatedPages = allocator->numAllocated(); if (numPages < maxPages - allocatedPages) { // There is free space for the read-ahead. return true; diff --git a/velox/dwio/common/SelectiveColumnReader.cpp b/velox/dwio/common/SelectiveColumnReader.cpp index 78754bf37db7..4b6624245139 100644 --- a/velox/dwio/common/SelectiveColumnReader.cpp +++ b/velox/dwio/common/SelectiveColumnReader.cpp @@ -126,22 +126,27 @@ void SelectiveColumnReader::prepareNulls( simd::memset(rawResultNulls_, bits::kNotNullByte, resultNulls_->capacity()); } -bool SelectiveColumnReader::shouldMoveNulls(RowSet rows) { - if (rows.size() == numValues_) { +const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { + if (rows.size() == numValues_ || !anyNulls_) { // Nulls will only be moved if there is a selection on values. A cast // alone does not move nulls. 
-    return false;
+    return nullptr;
   }
-  VELOX_CHECK(
-      !returnReaderNulls_,
-      "Do not return reader nulls if retrieving a subset of values");
-  if (anyNulls_) {
-    VELOX_CHECK(
-        resultNulls_ && resultNulls_->as<uint64_t>() == rawResultNulls_);
-    VELOX_CHECK_GT(resultNulls_->capacity() * 8, rows.size());
-    return true;
+  const uint64_t* moveFrom = rawResultNulls_;
+  if (returnReaderNulls_) {
+    if (!(resultNulls_ && resultNulls_->unique() &&
+          resultNulls_->capacity() >= rows.size() + simd::kPadding)) {
+      resultNulls_ = AlignedBuffer::allocate<bool>(
+          rows.size() + (simd::kPadding * 8), &memoryPool_);
+      rawResultNulls_ = resultNulls_->asMutable<uint64_t>();
+    }
+    moveFrom = nullsInReadRange_->as<uint64_t>();
+    bits::copyBits(moveFrom, 0, rawResultNulls_, 0, rows.size());
+    returnReaderNulls_ = false;
   }
-  return false;
+  VELOX_CHECK(resultNulls_ && resultNulls_->as<uint64_t>() == rawResultNulls_);
+  VELOX_CHECK_GT(resultNulls_->capacity() * 8, rows.size());
+  return moveFrom;
 }
 
 void SelectiveColumnReader::getIntValues(
@@ -257,7 +262,7 @@ void SelectiveColumnReader::compactScalarValues(
   auto rawBits = reinterpret_cast<uint64_t*>(rawValues_);
   vector_size_t rowIndex = 0;
   auto nextRow = rows[rowIndex];
-  bool moveNulls = shouldMoveNulls(rows);
+  auto* moveNullsFrom = shouldMoveNulls(rows);
   for (size_t i = 0; i < numValues_; i++) {
     if (outputRows_[i] < nextRow) {
       continue;
     }
@@ -266,9 +271,8 @@ void SelectiveColumnReader::compactScalarValues(
     VELOX_DCHECK(outputRows_[i] == nextRow);
     bits::setBit(rawBits, rowIndex, bits::isBitSet(rawBits, i));
-    if (moveNulls && rowIndex != i) {
-      bits::setBit(
-          rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i));
+    if (moveNullsFrom && rowIndex != i) {
+      bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i));
     }
     if (!isFinal) {
       outputRows_[rowIndex] = nextRow;
@@ -289,7 +293,7 @@ char* SelectiveColumnReader::copyStringValue(folly::StringPiece value) {
   if (stringBuffers_.empty() || rawStringUsed_ + size > rawStringSize_) {
     auto bytes = std::max(size, kStringBufferSize);
     BufferPtr buffer = AlignedBuffer::allocate<char>(bytes, &memoryPool_);
-    // Use the prefered size instead of the requested one to improve memory
+    // Use the preferred size instead of the requested one to improve memory
     // efficiency.
     buffer->setSize(buffer->capacity());
     stringBuffers_.push_back(buffer);
diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h
index eca0f34fb931..90541b29483f 100644
--- a/velox/dwio/common/SelectiveColumnReader.h
+++ b/velox/dwio/common/SelectiveColumnReader.h
@@ -496,9 +496,10 @@ class SelectiveColumnReader {
   template <typename T, typename TVector>
   void upcastScalarValues(RowSet rows);
 
-  // Returns true if compactScalarValues and upcastScalarValues should
-  // move null flags. Checks consistency of nulls-related state.
-  bool shouldMoveNulls(RowSet rows);
+  // Returns the source null bits if compactScalarValues and upcastScalarValues
+  // should move null flags. Returns nullptr if nulls do not need to be moved.
+  // Checks consistency of nulls-related state.
+ const uint64_t* shouldMoveNulls(RowSet rows); void addStringValue(folly::StringPiece value); diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index 1a8a69909077..0b3c0f37d599 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -183,7 +183,7 @@ void SelectiveColumnReader::upcastScalarValues(RowSet rows) { } vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - bool moveNulls = shouldMoveNulls(rows); + auto* moveNullsFrom = shouldMoveNulls(rows); for (size_t i = 0; i < numValues_; i++) { if (sourceRows[i] < nextRow) { continue; @@ -191,9 +191,8 @@ void SelectiveColumnReader::upcastScalarValues(RowSet rows) { VELOX_DCHECK(sourceRows[i] == nextRow); buf[rowIndex] = typedSourceValues[i]; - if (moveNulls && rowIndex != i) { - bits::setBit( - rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i)); + if (moveNullsFrom && rowIndex != i) { + bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); } valueRows_[rowIndex] = nextRow; rowIndex++; @@ -239,7 +238,7 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { } vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - bool moveNulls = shouldMoveNulls(rows); + auto* moveNullsFrom = shouldMoveNulls(rows); for (size_t i = 0; i < numValues_; i++) { if (sourceRows[i] < nextRow) { continue; @@ -247,9 +246,8 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { VELOX_DCHECK(sourceRows[i] == nextRow); typedDestValues[rowIndex] = typedSourceValues[i]; - if (moveNulls && rowIndex != i) { - bits::setBit( - rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i)); + if (moveNullsFrom && rowIndex != i) { + bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); } if (!isFinal) { valueRows_[rowIndex] = nextRow; @@ -310,7 +308,7 @@ void SelectiveColumnReader::compactComplexValues( } vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - bool moveNulls = shouldMoveNulls(rows); + auto* moveNullsFrom = shouldMoveNulls(rows); for (size_t i = 0; i < numValues_; i++) { if (sourceRows[i] < nextRow) { continue; @@ -319,9 +317,8 @@ void SelectiveColumnReader::compactComplexValues( VELOX_DCHECK(sourceRows[i] == nextRow); // The value at i is moved to be the value at 'rowIndex'. 
move(i, rowIndex); - if (moveNulls && rowIndex != i) { - bits::setBit( - rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i)); + if (moveNullsFrom && rowIndex != i) { + bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); } if (!isFinal) { valueRows_[rowIndex] = nextRow; diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 5ea0249c8898..bf8826f485b3 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -128,8 +128,8 @@ void SelectiveStructColumnReaderBase::read( activeRows = outputRows_; } - VELOX_CHECK(!children_.empty()); auto& childSpecs = scanSpec_->children(); + VELOX_CHECK(!childSpecs.empty()); for (size_t i = 0; i < childSpecs.size(); ++i) { auto& childSpec = childSpecs[i]; if (isChildConstant(*childSpec)) { @@ -293,7 +293,7 @@ void setNullField(vector_size_t size, VectorPtr& field) { void SelectiveStructColumnReaderBase::getValues( RowSet rows, VectorPtr* result) { - VELOX_CHECK(!children_.empty()); + VELOX_CHECK(!scanSpec_->children().empty()); VELOX_CHECK( *result != nullptr, "SelectiveStructColumnReaderBase expects a non-null result"); diff --git a/velox/dwio/dwrf/test/CacheInputTest.cpp b/velox/dwio/dwrf/test/CacheInputTest.cpp index a31d82146590..3bb9c588d2c2 100644 --- a/velox/dwio/dwrf/test/CacheInputTest.cpp +++ b/velox/dwio/dwrf/test/CacheInputTest.cpp @@ -114,6 +114,9 @@ class CacheTest : public testing::Test { if (ssdCache) { ssdCache->deleteFiles(); } + if (cache_) { + cache_->prepareShutdown(); + } } void initializeCache(uint64_t maxBytes, uint64_t ssdBytes = 0) { @@ -130,10 +133,8 @@ class CacheTest : public testing::Test { } memory::MmapAllocator::Options options; options.capacity = maxBytes; - cache_ = std::make_shared( - std::make_shared(options), - maxBytes, - std::move(ssd)); + allocator_ = std::make_shared(options); + cache_ = AsyncDataCache::create(allocator_.get(), std::move(ssd)); cache_->setVerifyHook(checkEntry); for (auto i = 0; i < kMaxStreams; ++i) { streamIds_.push_back(std::make_unique( @@ -424,6 +425,7 @@ class CacheTest : public testing::Test { folly::F14FastMap> pathToInput_; std::shared_ptr tempDirectory_; cache::FileGroupStats* FOLLY_NULLABLE groupStats_ = nullptr; + std::shared_ptr allocator_; std::shared_ptr cache_; std::shared_ptr ioStats_; std::unique_ptr executor_; @@ -471,7 +473,7 @@ TEST_F(CacheTest, window) { auto cacheInput = dynamic_cast(stream.get()); EXPECT_TRUE(cacheInput != nullptr); auto maxSize = - cache_->sizeClasses().back() * memory::AllocationTraits::kPageSize; + allocator_->sizeClasses().back() * memory::AllocationTraits::kPageSize; const void* buffer; int32_t size; int32_t numRead = 0; diff --git a/velox/dwio/dwrf/test/ReaderTest.cpp b/velox/dwio/dwrf/test/ReaderTest.cpp index 1a349c4abb68..fb58f9ba3bcb 100644 --- a/velox/dwio/dwrf/test/ReaderTest.cpp +++ b/velox/dwio/dwrf/test/ReaderTest.cpp @@ -1893,3 +1893,39 @@ TEST(TestReader, reuseRowNumberColumn) { ASSERT_NE(rowNum.get(), result->asUnchecked()->childAt(1).get()); } } + +TEST(TestReader, failToReuseReaderNulls) { + auto* pool = defaultPool.get(); + VectorMaker maker(pool); + auto c0 = maker.rowVector( + {"a", "b"}, + { + maker.flatVector(11, folly::identity), + maker.flatVector( + 11, folly::identity, [](auto i) { return i % 3 == 0; }), + }); + // Set a null so that the children will not be loaded lazily. 
+ bits::setNull(c0->mutableRawNulls(), 10); + auto data = maker.rowVector({ + c0, + maker.rowVector({"c"}, {maker.flatVector(11, folly::identity)}), + }); + auto schema = asRowType(data->type()); + auto [writer, reader] = createWriterReader({data}, *pool); + auto spec = std::make_shared(""); + spec->addAllChildFields(*schema); + spec->childByName("c0")->childByName("a")->setFilter( + std::make_unique( + 0, std::numeric_limits::max(), false)); + spec->childByName("c1")->childByName("c")->setFilter( + std::make_unique(0, 4, false)); + RowReaderOptions rowReaderOpts; + rowReaderOpts.setScanSpec(spec); + auto rowReader = reader->createRowReader(rowReaderOpts); + auto result = BaseVector::create(schema, 0, pool); + ASSERT_EQ(rowReader->next(10, result), 10); + ASSERT_EQ(result->size(), 5); + for (int i = 0; i < result->size(); ++i) { + ASSERT_TRUE(result->equalValueAt(data.get(), i, i)) << result->toString(i); + } +} diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 8672d1ad690c..f113feb04242 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -177,6 +177,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat + 1, maxDefine); } @@ -196,6 +197,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } @@ -220,6 +222,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } else if (children.size() == 2) { @@ -234,6 +237,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } @@ -248,6 +252,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } @@ -260,6 +265,10 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( int32_t type_length = schemaElement.__isset.type_length ? schemaElement.type_length : 0; std::vector> children; + const std::optional logicalType_ = + schemaElement.__isset.logicalType + ? 
std::optional(schemaElement.logicalType) + : std::nullopt; std::shared_ptr leafTypePtr = std::make_shared( veloxType, @@ -269,6 +278,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( columnIdx++, name, schemaElement.type, + logicalType_, maxRepeat, maxDefine, precision, @@ -289,6 +299,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( columnIdx++, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine - 1); } diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.h b/velox/dwio/parquet/reader/ParquetTypeWithId.h index 9a382194ef06..1fa0d6b3f5d8 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.h +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.h @@ -42,6 +42,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { uint32_t column, std::string name, std::optional parquetType, + std::optional logicalType, uint32_t maxRepeat, uint32_t maxDefine, int32_t precision = 0, @@ -50,6 +51,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { : TypeWithId(type, std::move(children), id, maxId, column), name_(name), parquetType_(parquetType), + logicalType_(std::move(logicalType)), maxRepeat_(maxRepeat), maxDefine_(maxDefine), precision_(precision), @@ -74,6 +76,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { const std::string name_; const std::optional parquetType_; + const std::optional logicalType_; const uint32_t maxRepeat_; const uint32_t maxDefine_; const int32_t precision_; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 5a30c02bb5ac..3c219dccfa8d 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -400,6 +400,47 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayLegacy) { "SELECT UNNEST(array[array['a', 'b'], array[], array['c', 'd']])"); } +TEST_F(ParquetTableScanTest, readAsLowerCase) { + auto plan = PlanBuilder(pool_.get()) + .tableScan(ROW({"a"}, {BIGINT()}), {}, "") + .planNode(); + CursorParameters params; + std::shared_ptr executor = + std::make_shared( + std::thread::hardware_concurrency()); + std::shared_ptr queryCtx = + std::make_shared(executor.get()); + std::unordered_map configs = { + {std::string( + connector::hive::HiveConfig::kFileColumnNamesReadAsLowerCase), + "true"}}; + queryCtx->setConnectorConfigOverridesUnsafe( + kHiveConnectorId, std::move(configs)); + params.queryCtx = queryCtx; + params.planNode = plan; + const int numSplitsPerFile = 1; + + bool noMoreSplits = false; + auto addSplits = [&](exec::Task* task) { + if (!noMoreSplits) { + auto const splits = HiveConnectorTestBase::makeHiveConnectorSplits( + {getExampleFilePath("upper.parquet")}, + numSplitsPerFile, + dwio::common::FileFormat::PARQUET); + for (const auto& split : splits) { + task->addSplit("0", exec::Split(split)); + } + task->noMoreSplits("0"); + } + noMoreSplits = true; + }; + auto result = readCursor(params, addSplits); + ASSERT_TRUE(waitForTaskCompletion(result.first->task().get())); + auto vector = makeFlatVector({0, 1}); + auto expected = makeRowVector({"a"}, {vector}); + assertEqualResults(result.second, {expected}); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); folly::init(&argc, &argv, false); diff --git a/velox/examples/CMakeLists.txt b/velox/examples/CMakeLists.txt index 013fa6b5db54..8625d34ce82e 100644 --- a/velox/examples/CMakeLists.txt +++ b/velox/examples/CMakeLists.txt @@ -20,12 +20,12 @@ 
target_link_libraries(velox_example_simple_functions velox_functions_lib add_executable(velox_example_expression_eval ExpressionEval.cpp) target_link_libraries(velox_example_expression_eval velox_type velox_vector - velox_memory velox_expression) + velox_caching velox_memory velox_expression) add_executable(velox_example_opaque_type OpaqueType.cpp) target_link_libraries(velox_example_opaque_type velox_type velox_vector - velox_expression velox_memory) + velox_caching velox_expression velox_memory) # This is disabled temporarily until we figure out why g++ is crashing linking # it on linux builds. diff --git a/velox/exec/AggregateWindow.cpp b/velox/exec/AggregateWindow.cpp index 054426a317af..54a3d91bd1c0 100644 --- a/velox/exec/AggregateWindow.cpp +++ b/velox/exec/AggregateWindow.cpp @@ -35,10 +35,13 @@ class AggregateWindowFunction : public exec::WindowFunction { const std::string& name, const std::vector& args, const TypePtr& resultType, + bool ignoreNulls, velox::memory::MemoryPool* pool, HashStringAllocator* stringAllocator, const core::QueryConfig& config) : WindowFunction(resultType, pool, stringAllocator) { + VELOX_USER_CHECK( + !ignoreNulls, "Aggregate window functions do not support IGNORE NULLS"); argTypes_.reserve(args.size()); argIndices_.reserve(args.size()); argVectors_.reserve(args.size()); @@ -391,13 +394,19 @@ void registerAggregateWindowFunction(const std::string& name) { [name]( const std::vector& args, const TypePtr& resultType, - bool /*ignoreNulls*/, + bool ignoreNulls, velox::memory::MemoryPool* pool, HashStringAllocator* stringAllocator, const core::QueryConfig& config) -> std::unique_ptr { return std::make_unique( - name, args, resultType, pool, stringAllocator, config); + name, + args, + resultType, + ignoreNulls, + pool, + stringAllocator, + config); }); } } diff --git a/velox/exec/GroupingSet.cpp b/velox/exec/GroupingSet.cpp index abd7ac6c2cc5..e3f616289b87 100644 --- a/velox/exec/GroupingSet.cpp +++ b/velox/exec/GroupingSet.cpp @@ -307,10 +307,15 @@ namespace { void initializeAggregates( const std::vector& aggregates, - RowContainer& rows) { + RowContainer& rows, + bool excludeToIntermediate) { const auto numKeys = rows.keyTypes().size(); - for (auto i = 0; i < aggregates.size(); ++i) { - auto& function = aggregates[i].function; + int i = 0; + for (auto& aggregate : aggregates) { + auto& function = aggregate.function; + if (excludeToIntermediate && function->supportsToIntermediate()) { + continue; + } function->setAllocator(&rows.stringAllocator()); const auto rowColumn = rows.columnAt(numKeys + i); @@ -319,15 +324,19 @@ void initializeAggregates( rowColumn.nullByte(), rowColumn.nullMask(), rows.rowSizeOffset()); + ++i; } } } // namespace -std::vector GroupingSet::accumulators() { +std::vector GroupingSet::accumulators(bool excludeToIntermediate) { std::vector accumulators; accumulators.reserve(aggregates_.size()); for (auto& aggregate : aggregates_) { - accumulators.push_back(Accumulator{aggregate.function.get()}); + if (!excludeToIntermediate || + !aggregate.function->supportsToIntermediate()) { + accumulators.push_back(Accumulator{aggregate.function.get()}); + } } if (sortedAggregations_ != nullptr) { @@ -345,14 +354,14 @@ std::vector GroupingSet::accumulators() { void GroupingSet::createHashTable() { if (ignoreNullKeys_) { table_ = HashTable::createForAggregation( - std::move(hashers_), accumulators(), &pool_); + std::move(hashers_), accumulators(false), &pool_); } else { table_ = HashTable::createForAggregation( - std::move(hashers_), accumulators(), 
&pool_); + std::move(hashers_), accumulators(false), &pool_); } RowContainer& rows = *table_->rows(); - initializeAggregates(aggregates_, rows); + initializeAggregates(aggregates_, rows, false); auto numColumns = rows.keyTypes().size() + aggregates_.size(); @@ -637,9 +646,6 @@ bool GroupingSet::getOutput( if (table_) { table_->clear(); } - if (remainingInput_) { - addRemainingInput(); - } return false; } extractGroups(folly::Range(groups, numGroups), result); @@ -866,15 +872,16 @@ bool GroupingSet::getOutputWithSpill( mergeRows_ = std::make_unique( keyTypes, !ignoreNullKeys_, - accumulators(), + accumulators(false), std::vector(), false, false, false, false, - &pool_); + &pool_, + table_->rows()->stringAllocatorShared()); - initializeAggregates(aggregates_, *mergeRows_); + initializeAggregates(aggregates_, *mergeRows_, false); // Take ownership of the rows and free the hash table. The table will not be // needed for producing spill output. @@ -978,11 +985,20 @@ void GroupingSet::abandonPartialAggregation() { } } - VELOX_CHECK_EQ(table_->rows()->numRows(), 0) - intermediateRows_ = table_->moveRows(); - intermediateRows_->clear(); - - table_ = nullptr; + VELOX_CHECK_EQ(table_->rows()->numRows(), 0); + intermediateRows_ = std::make_unique( + table_->rows()->keyTypes(), + !ignoreNullKeys_, + accumulators(true), + std::vector(), + false, + false, + false, + false, + &pool_, + table_->rows()->stringAllocatorShared()); + initializeAggregates(aggregates_, *intermediateRows_, true); + table_.reset(); } void GroupingSet::toIntermediate( diff --git a/velox/exec/GroupingSet.h b/velox/exec/GroupingSet.h index dc0e11bad517..05b42a4c60b8 100644 --- a/velox/exec/GroupingSet.h +++ b/velox/exec/GroupingSet.h @@ -191,9 +191,11 @@ class GroupingSet { // groups. void extractSpillResult(const RowVectorPtr& result); - // Return a list of accumulators for 'aggregates_' plus one more accumulator - // for 'sortedAggregations_'. - std::vector accumulators(); + // Return a list of accumulators for 'aggregates_', plus one more accumulator + // for 'sortedAggregations_', and one for each 'distinctAggregations_'. When + // 'excludeToIntermediate' is true, skip the functions that support + // 'toIntermediate'. + std::vector accumulators(bool excludeToIntermediate); std::vector keyChannels_; diff --git a/velox/exec/HashBuild.cpp b/velox/exec/HashBuild.cpp index 5ce6e86d3fad..37e9599f6f85 100644 --- a/velox/exec/HashBuild.cpp +++ b/velox/exec/HashBuild.cpp @@ -777,23 +777,10 @@ bool HashBuild::finishHashBuild() { // https://github.com/facebookincubator/velox/issues/3567 is fixed. const bool allowParallelJoinBuild = !otherTables.empty() && spillPartitions.empty(); - // Inject test value to catch the memory allocations from parallel join - // build. - if (TestValue::enabled()) { - std::vector buildOps; - buildOps.reserve(peers.size()); - for (auto& peer : peers) { - auto* op = peer->findOperator(planNodeId()); - buildOps.push_back(op); - } - TestValue::adjust( - "facebook::velox::exec::HashBuild::prepareJoinTable", &buildOps); - } table_->prepareJoinTable( std::move(otherTables), allowParallelJoinBuild ? 
operatorCtx_->task()->queryCtx()->executor() : nullptr); - addRuntimeStats(); if (joinBridge_->setHashTable( std::move(table_), diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp index 64af72f58ebe..257781b0316a 100644 --- a/velox/exec/HashTable.cpp +++ b/velox/exec/HashTable.cpp @@ -764,7 +764,7 @@ bool HashTable::canApplyParallelJoinBuild() const { template void HashTable::parallelJoinBuild() { TestValue::adjust( - "facebook::velox::exec::HashTable::parallelJoinBuild", nullptr); + "facebook::velox::exec::HashTable::parallelJoinBuild", this); int32_t numPartitions = 1 + otherTables_.size(); VELOX_CHECK_GT( capacity_ / numPartitions, @@ -799,11 +799,15 @@ void HashTable::parallelJoinBuild() { syncWorkItems(buildSteps, error, true); }); + // The parallel table partitioning step. + std::vector> rowPartitions; + rowPartitions.reserve(numPartitions); for (auto i = 0; i < numPartitions; ++i) { - auto table = i == 0 ? this : otherTables_[i - 1].get(); - partitionSteps.push_back( - std::make_shared>([this, table, numPartitions]() { - partitionRows(*table); + auto* table = i == 0 ? this : otherTables_[i - 1].get(); + rowPartitions.push_back(table->rows()->createRowPartitions(*rows_->pool())); + partitionSteps.push_back(std::make_shared>( + [this, table, rawRowPartitions = rowPartitions.back().get()]() { + partitionRows(*table, *rawRowPartitions); return std::make_unique(true); })); assert(!partitionSteps.empty()); // lint @@ -814,20 +818,23 @@ void HashTable::parallelJoinBuild() { if (error) { std::rethrow_exception(error); } + + // The parallel table building step. std::vector> overflowPerPartition(numPartitions); for (auto i = 0; i < numPartitions; ++i) { - buildSteps.push_back( - std::make_shared>([i, &overflowPerPartition, this]() { - buildJoinPartition(i, overflowPerPartition[i]); + buildSteps.push_back(std::make_shared>( + [this, i, &overflowPerPartition, &rowPartitions]() { + buildJoinPartition(i, rowPartitions, overflowPerPartition[i]); return std::make_unique(true); })); - assert(!buildSteps.empty()); // lint + VELOX_CHECK(!buildSteps.empty()); buildExecutor_->add([step = buildSteps.back()]() { step->prepare(); }); } syncWorkItems(buildSteps, error); if (error) { std::rethrow_exception(error); } + raw_vector hashes; for (auto i = 0; i < numPartitions; ++i) { auto& overflows = overflowPerPartition[i]; @@ -872,7 +879,8 @@ int32_t findPartition( template void HashTable::partitionRows( - HashTable& subtable) { + HashTable& subtable, + RowPartitions& rowPartitions) { constexpr int32_t kBatch = 1024; raw_vector rows(kBatch); raw_vector hashes(kBatch); @@ -891,7 +899,7 @@ void HashTable::partitionRows( partitions[i] = findPartition( index, buildPartitionBounds_.data(), buildPartitionBounds_.size()); } - subtable.rows_->partitions().appendPartitions( + rowPartitions.appendPartitions( folly::Range(partitions.data(), numRows)); } } @@ -899,6 +907,7 @@ void HashTable::partitionRows( template void HashTable::buildJoinPartition( uint8_t partition, + const std::vector>& rowPartitions, std::vector& overflow) { constexpr int32_t kBatch = 1024; raw_vector rows(kBatch); @@ -908,7 +917,7 @@ void HashTable::buildJoinPartition( auto table = i == 0 ? 
this : otherTables_[i - 1].get(); RowContainerIterator iter; while (auto numRows = table->rows_->listPartitionRows( - iter, partition, kBatch, rows.data())) { + iter, partition, kBatch, *rowPartitions[i], rows.data())) { hashRows(folly::Range(rows.data(), numRows), false, hashes); insertForJoin( rows.data(), diff --git a/velox/exec/HashTable.h b/velox/exec/HashTable.h index 2f907c095e33..6c0788ca83f8 100644 --- a/velox/exec/HashTable.h +++ b/velox/exec/HashTable.h @@ -596,12 +596,17 @@ class HashTable : public BaseHashTable { // Inserts the rows in 'partition' from this and 'otherTables' into 'this'. // The rows that would have gone past the end of the partition are returned in // 'overflow'. - void buildJoinPartition(uint8_t partition, std::vector& overflow); + void buildJoinPartition( + uint8_t partition, + const std::vector>& rowPartitions, + std::vector& overflow); // Assigns a partition to each row of 'subtable' in RowPartitions of // subtable's RowContainer. If 'hashMode_' is kNormalizedKeys, records the // normalized key of each row below the row in its container. - void partitionRows(HashTable& subtable); + void partitionRows( + HashTable& subtable, + RowPartitions& rowPartitions); // Calculates hashes for 'rows' and returns them in 'hashes'. If // 'initNormalizedKeys' is true, the normalized keys are stored diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index d0f44c536693..0f3114fbe916 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -54,7 +54,7 @@ OperatorCtx::createConnectorQueryCtx( driverCtx_->task->queryCtx()->getConnectorConfig(connectorId), std::make_unique( execCtx()->queryCtx(), execCtx()->pool()), - driverCtx_->task->queryCtx()->allocator(), + driverCtx_->task->queryCtx()->cache(), driverCtx_->task->queryCtx()->queryId(), taskId(), planNodeId, diff --git a/velox/exec/PartitionedOutput.cpp b/velox/exec/PartitionedOutput.cpp index 50d301c75b43..20d000ddb6eb 100644 --- a/velox/exec/PartitionedOutput.cpp +++ b/velox/exec/PartitionedOutput.cpp @@ -103,7 +103,7 @@ BlockingReason Destination::flush( PartitionedOutput::PartitionedOutput( int32_t operatorId, - DriverCtx* FOLLY_NONNULL ctx, + DriverCtx* ctx, const std::shared_ptr& planNode) : Operator( ctx, @@ -131,9 +131,12 @@ PartitionedOutput::PartitionedOutput( maxBufferedBytes_(ctx->task->queryCtx() ->queryConfig() .maxPartitionedOutputBufferSize()) { - if (numDestinations_ == 1 || planNode->isBroadcast()) { - VELOX_CHECK(keyChannels_.empty()); - VELOX_CHECK_NULL(partitionFunction_); + if (!planNode->isPartitioned()) { + VELOX_USER_CHECK_EQ(numDestinations_, 1); + } + if (numDestinations_ == 1) { + VELOX_USER_CHECK(keyChannels_.empty()); + VELOX_USER_CHECK_NULL(partitionFunction_); } } diff --git a/velox/exec/PartitionedOutput.h b/velox/exec/PartitionedOutput.h index 16ae62cbf85c..0da3801dceb2 100644 --- a/velox/exec/PartitionedOutput.h +++ b/velox/exec/PartitionedOutput.h @@ -27,7 +27,7 @@ class Destination { Destination( const std::string& taskId, int destination, - memory::MemoryPool* FOLLY_NONNULL pool) + memory::MemoryPool* pool) : taskId_(taskId), destination_(destination), pool_(pool) { setTargetSizePct(); } @@ -52,13 +52,13 @@ class Destination { const RowVectorPtr& output, PartitionedOutputBufferManager& bufferManager, const std::function& bufferReleaseFn, - bool* FOLLY_NONNULL atEnd, - ContinueFuture* FOLLY_NONNULL future); + bool* atEnd, + ContinueFuture* future); BlockingReason flush( PartitionedOutputBufferManager& bufferManager, const std::function& bufferReleaseFn, - 
ContinueFuture* FOLLY_NULLABLE future); + ContinueFuture* future); bool isFinished() const { return finished_; @@ -91,7 +91,7 @@ class Destination { const std::string taskId_; const int destination_; - memory::MemoryPool* FOLLY_NONNULL const pool_; + memory::MemoryPool* const pool_; uint64_t bytesInCurrent_{0}; std::vector rows_; @@ -130,7 +130,7 @@ class PartitionedOutput : public Operator { PartitionedOutput( int32_t operatorId, - DriverCtx* FOLLY_NONNULL ctx, + DriverCtx* ctx, const std::shared_ptr& planNode); void addInput(RowVectorPtr input) override; @@ -146,7 +146,7 @@ class PartitionedOutput : public Operator { return true; } - BlockingReason isBlocked(ContinueFuture* FOLLY_NONNULL future) override { + BlockingReason isBlocked(ContinueFuture* future) override { if (blockingReason_ != BlockingReason::kNotBlocked) { *future = std::move(future_); blockingReason_ = BlockingReason::kNotBlocked; diff --git a/velox/exec/PartitionedOutputBufferManager.cpp b/velox/exec/PartitionedOutputBufferManager.cpp index 1a7a36dd6e9d..fc124463dcb7 100644 --- a/velox/exec/PartitionedOutputBufferManager.cpp +++ b/velox/exec/PartitionedOutputBufferManager.cpp @@ -242,11 +242,12 @@ PartitionedOutputBuffer::PartitionedOutputBuffer( void PartitionedOutputBuffer::updateOutputBuffers( int numBuffers, bool noMoreBuffers) { - VELOX_CHECK( - !isPartitioned(), - "{} is not supported on {} output buffer", - __FUNCTION__, - kind_); + if (isPartitioned()) { + VELOX_CHECK_EQ(buffers_.size(), numBuffers); + VELOX_CHECK(noMoreBuffers); + noMoreBuffers_ = true; + return; + } std::vector promises; bool isFinished; @@ -476,9 +477,17 @@ bool PartitionedOutputBuffer::isFinished() { } bool PartitionedOutputBuffer::isFinishedLocked() { - if (!isPartitioned() && !noMoreBuffers_) { + // NOTE: for broadcast output buffer, we can only mark it as finished after + // receiving the no more (destination) buffers signal. + if (isBroadcast() && !noMoreBuffers_) { return false; } + if (isArbitrary()) { + VELOX_CHECK_NOT_NULL(arbitraryBuffer_); + if (!arbitraryBuffer_->empty()) { + return false; + } + } for (auto& buffer : buffers_) { if (buffer != nullptr) { return false; diff --git a/velox/exec/RowContainer.cpp b/velox/exec/RowContainer.cpp index c9053d656372..dd2a0ee066bb 100644 --- a/velox/exec/RowContainer.cpp +++ b/velox/exec/RowContainer.cpp @@ -107,14 +107,17 @@ RowContainer::RowContainer( bool isJoinBuild, bool hasProbedFlag, bool hasNormalizedKeys, - memory::MemoryPool* pool) + memory::MemoryPool* pool, + std::shared_ptr stringAllocator) : keyTypes_(keyTypes), nullableKeys_(nullableKeys), - accumulators_(accumulators), isJoinBuild_(isJoinBuild), + accumulators_(accumulators), hasNormalizedKeys_(hasNormalizedKeys), rows_(pool), - stringAllocator_(pool) { + stringAllocator_( + stringAllocator ? stringAllocator + : std::make_shared(pool)) { // Compute the layout of the payload row. The row has keys, null // flags, accumulators, dependent fields. All fields are fixed // width. 
If variable width data is referenced, this is done with @@ -234,11 +237,14 @@ RowContainer::RowContainer( } } +RowContainer::~RowContainer() { + clear(); +} + char* RowContainer::newRow() { - char* row; - VELOX_DCHECK( - !partitions_, "Rows may not be added after partitions() has been called"); + VELOX_DCHECK(mutable_, "Can't add row into an immutable row container"); ++numRows_; + char* row; if (firstFreeRow_) { row = firstFreeRow_; VELOX_CHECK(bits::isBitSet(row, freeFlagOffset_)); @@ -259,8 +265,11 @@ char* RowContainer::initializeRow(char* row, bool reuse) { auto rows = folly::Range(&row, 1); freeVariableWidthFields(rows); freeAggregates(rows); + } else if (rowSizeOffset_ != 0 && checkFree_) { + // zero out string views so that clear() will not hit uninited data. The + // fastest way is to set the whole row to 0. + ::memset(row, 0, fixedRowSize_); } - if (!nullOffsets_.empty()) { memcpy( row + nullByte(nullOffsets_[0]), @@ -300,7 +309,11 @@ void RowContainer::freeVariableWidthFields(folly::Range rows) { if (!isNullAt(row, column.nullByte(), column.nullMask())) { StringView view = valueAt(row, column.offset()); if (!view.isInline()) { - stringAllocator_.free(HashStringAllocator::headerOf(view.data())); + stringAllocator_->free( + HashStringAllocator::headerOf(view.data())); + if (checkFree_) { + valueAt(row, column.offset()) = StringView(); + } } } } @@ -421,11 +434,11 @@ void RowContainer::storeComplexType( row[nullByte] |= nullMask; return; } - RowSizeTracker tracker(row[rowSizeOffset_], stringAllocator_); - ByteStream stream(&stringAllocator_, false, false); - auto position = stringAllocator_.newWrite(stream); + RowSizeTracker tracker(row[rowSizeOffset_], *stringAllocator_); + ByteStream stream(stringAllocator_.get(), false, false); + auto position = stringAllocator_->newWrite(stream); ContainerRowSerde::serialize(*decoded.base(), decoded.index(index), stream); - stringAllocator_.finishWrite(stream, 0); + stringAllocator_->finishWrite(stream, 0); valueAt(row, offset) = StringView(reinterpret_cast(position.position), stream.size()); } @@ -543,22 +556,22 @@ void RowContainer::hash( } void RowContainer::clear() { - if (usesExternalMemory_) { + const bool sharedStringAllocator = !stringAllocator_.unique(); + if (checkFree_ || sharedStringAllocator || usesExternalMemory_) { constexpr int32_t kBatch = 1000; std::vector rows(kBatch); - RowContainerIterator iter; - for (;;) { - int64_t numRows = listRows(&iter, kBatch, rows.data()); - if (!numRows) { - break; - } - auto rowsData = folly::Range(rows.data(), numRows); - freeAggregates(rowsData); + while (auto numRows = listRows(&iter, kBatch, rows.data())) { + eraseRows(folly::Range(rows.data(), numRows)); } } rows_.clear(); - stringAllocator_.clear(); + if (!sharedStringAllocator) { + if (checkFree_) { + stringAllocator_->checkEmpty(); + } + stringAllocator_->clear(); + } numRows_ = 0; numRowsWithNormalizedKey_ = 0; normalizedKeySize_ = originalNormalizedKeySize_; @@ -618,7 +631,7 @@ std::optional RowContainer::estimateRowSize() const { } int64_t freeBytes = rows_.freeBytes() + fixedRowSize_ * numFreeRows_; int64_t usedSize = rows_.allocatedBytes() - freeBytes + - stringAllocator_.retainedSize() - stringAllocator_.freeSpace(); + stringAllocator_->retainedSize() - stringAllocator_->freeSpace(); int64_t rowSize = usedSize / numRows_; VELOX_CHECK_GT( rowSize, 0, "Estimated row size of the RowContainer must be positive."); @@ -633,7 +646,7 @@ int64_t RowContainer::sizeIncrement( constexpr int32_t kAllocUnit = 
memory::AllocationTraits::kHugePageSize;
  int32_t needRows = std::max<int32_t>(0, numRows - numFreeRows_);
  int64_t needBytes =
-      std::max<int64_t>(0, variableLengthBytes - stringAllocator_.freeSpace());
+      std::max<int64_t>(0, variableLengthBytes - stringAllocator_->freeSpace());
  return bits::roundUp(needRows * fixedRowSize_, kAllocUnit) +
      bits::roundUp(needBytes, kAllocUnit);
 }
@@ -683,28 +696,30 @@ void RowContainer::skip(RowContainerIterator& iter, int32_t numRows) {
   iter.rowNumber += numRows;
 }
 
-RowPartitions& RowContainer::partitions() {
-  if (!partitions_) {
-    partitions_ = std::make_unique<RowPartitions>(numRows_, *rows_.pool());
-  }
-  return *partitions_;
+std::unique_ptr<RowPartitions> RowContainer::createRowPartitions(
+    memory::MemoryPool& pool) {
+  VELOX_CHECK(
+      mutable_, "Can only create RowPartitions once from a row container");
+  mutable_ = false;
+  return std::make_unique<RowPartitions>(numRows_, pool);
 }
 
 int32_t RowContainer::listPartitionRows(
     RowContainerIterator& iter,
     uint8_t partition,
     int32_t maxRows,
+    const RowPartitions& rowPartitions,
     char** result) {
-  if (!numRows_) {
-    return 0;
-  }
   VELOX_CHECK(
-      partitions_, "partitions() must be called before listPartitionRows()");
+      !mutable_, "Can't list partition rows from a mutable row container");
   VELOX_CHECK_EQ(
-      partitions_->size(), numRows_, "All rows must have a partition");
-  auto partitionNumberVector = xsimd::batch<uint8_t>::broadcast(partition);
-  auto& allocation = partitions_->allocation();
-  auto numRuns = allocation.numRuns();
+      rowPartitions.size(), numRows_, "All rows must have a partition");
+  if (numRows_ == 0) {
+    return 0;
+  }
+  const auto partitionNumberVector =
+      xsimd::batch<uint8_t>::broadcast(partition);
+  const auto& allocation = rowPartitions.allocation();
   int32_t numResults = 0;
   while (numResults < maxRows && iter.rowNumber < numRows_) {
     constexpr int32_t kBatch = xsimd::batch<uint8_t>::size;
@@ -762,10 +777,10 @@ int32_t RowContainer::listPartitionRows(
 
 RowPartitions::RowPartitions(int32_t numRows, memory::MemoryPool& pool)
     : capacity_(numRows) {
-  auto numPages =
-      bits::roundUp(capacity_, memory::AllocationTraits::kPageSize) /
-      memory::AllocationTraits::kPageSize;
-  pool.allocateNonContiguous(numPages, allocation_);
+  const auto numPages = memory::AllocationTraits::numPages(capacity_);
+  if (numPages > 0) {
+    pool.allocateNonContiguous(numPages, allocation_);
+  }
 }
 
 void RowPartitions::appendPartitions(folly::Range<const uint8_t*> partitions) {
diff --git a/velox/exec/RowContainer.h b/velox/exec/RowContainer.h
index 1af2166dc49a..8b99da63ac8a 100644
--- a/velox/exec/RowContainer.h
+++ b/velox/exec/RowContainer.h
@@ -186,6 +186,8 @@ class RowContainer {
             false, // hasNormalizedKey
             pool) {}
 
+  ~RowContainer();
+
   static int32_t combineAlignments(int32_t a, int32_t b);
 
   // 'keyTypes' gives the type of the key of each row. For a group by,
@@ -204,6 +206,10 @@
   // into one word for faster comparison. The bulk allocation is done
   // from 'allocator'. ContainerRowSerde is used for serializing complex
   // type values into the container.
+  /// 'stringAllocator' allows sharing the variable length data arena with
+  /// another RowContainer. This is needed for spilling, where the same
+  /// aggregates are used for reading one container and merging into another.
  RowContainer(
      const std::vector<TypePtr>& keyTypes,
      bool nullableKeys,
@@ -213,7 +219,8 @@
      bool isJoinBuild,
      bool hasProbedFlag,
      bool hasNormalizedKey,
-      memory::MemoryPool* FOLLY_NONNULL pool);
+      memory::MemoryPool* FOLLY_NONNULL pool,
+      std::shared_ptr<HashStringAllocator> stringAllocator = nullptr);
 
   // Allocates a new row and initializes possible aggregates to null.
   char* FOLLY_NONNULL newRow();
@@ -264,6 +271,10 @@
       int32_t columnIndex);
 
   HashStringAllocator& stringAllocator() {
+    return *stringAllocator_;
+  }
+
+  const std::shared_ptr<HashStringAllocator>& stringAllocatorShared() {
     return stringAllocator_;
   }
 
@@ -414,7 +425,9 @@
       auto range = rows_.rangeAt(i);
       auto* data =
           range.data() + memory::alignmentPadding(range.data(), alignment_);
-      auto limit = range.size();
+      auto limit = range.size() -
+          (reinterpret_cast<uintptr_t>(data) -
+           reinterpret_cast<uintptr_t>(range.data()));
       auto row = iter->rowOffset;
       while (row + rowSize <= limit) {
         rows[count++] = data + row +
@@ -575,7 +588,7 @@
       uint64_t* FOLLY_NONNULL result);
 
   uint64_t allocatedBytes() const {
-    return rows_.allocatedBytes() + stringAllocator_.retainedSize();
+    return rows_.allocatedBytes() + stringAllocator_->retainedSize();
   }
 
   // Returns the number of fixed size rows that can be allocated
@@ -584,7 +597,7 @@
   std::pair<int64_t, int64_t> freeSpace() const {
     return std::make_pair(
         rows_.freeBytes() / fixedRowSize_ + numFreeRows_,
-        stringAllocator_.freeSpace());
+        stringAllocator_->freeSpace());
   }
 
   // Returns the average size of rows in bytes stored in this container.
@@ -614,7 +627,7 @@
   }
 
   memory::MemoryPool* FOLLY_NONNULL pool() const {
-    return stringAllocator_.pool();
+    return stringAllocator_->pool();
   }
 
   // Returns the types of all non-aggregate columns of 'this', keys first.
@@ -631,7 +644,7 @@
   }
 
   const HashStringAllocator& stringAllocator() const {
-    return stringAllocator_;
+    return *stringAllocator_;
   }
 
   // Checks that row and free row counts match and that free list
@@ -643,26 +656,32 @@
     return (row[nullByte] & nullMask) != 0;
   }
 
-  /// Retrieves rows from 'iterator' whose partition equals
-  /// 'partition'. Writes up to 'maxRows' pointers to the rows in
-  /// 'result'. Returns the number of rows retrieved, 0 when no more
-  /// rows are found. 'iterator' is expected to be in initial state
-  /// on first call.
+  /// Creates a container to store a partition number for each row in this row
+  /// container. This is used by the parallel join build, which is responsible
+  /// for filling it. This function also marks the row container as immutable;
+  /// we expect the user to call it only once.
+  std::unique_ptr<RowPartitions> createRowPartitions(memory::MemoryPool& pool);
+
+  /// Retrieves rows from 'iterator' whose partition equals 'partition'. Writes
+  /// up to 'maxRows' pointers to the rows in 'result'. 'rowPartitions' contains
+  /// the partition number of each row in this container. The function returns
+  /// the number of rows retrieved, 0 when no more rows are found. 'iterator' is
+  /// expected to be in initial state on first call.
   int32_t listPartitionRows(
       RowContainerIterator& iterator,
      uint8_t partition,
      int32_t maxRows,
+      const RowPartitions& rowPartitions,
      char* FOLLY_NONNULL* FOLLY_NONNULL result);
 
-  /// Returns a container with a partition number for each row. This
-  /// is created on first use. The caller is responsible for filling
-  /// this.
-  RowPartitions& partitions();
-
  /// Advances 'iterator' by 'numRows'. 
The current row after skip is /// in iter.currentRow(). This is null if past end. Public for testing. void skip(RowContainerIterator& iterator, int32_t numRows); + bool testingMutable() const { + return mutable_; + } + private: // Offset of the pointer to the next free row on a free row. static constexpr int32_t kNextFreeOffset = 0; @@ -758,8 +777,8 @@ class RowContainer { } *reinterpret_cast<T*>(row + offset) = decoded.valueAt<T>(index); if constexpr (std::is_same_v<T, StringView>) { - RowSizeTracker tracker(row[rowSizeOffset_], stringAllocator_); - stringAllocator_.copyMultipart(row, offset); + RowSizeTracker tracker(row[rowSizeOffset_], *stringAllocator_); + stringAllocator_->copyMultipart(row, offset); } } @@ -772,8 +791,8 @@ class RowContainer { using T = typename TypeTraits<Kind>::NativeType; *reinterpret_cast<T*>(group + offset) = decoded.valueAt<T>(index); if constexpr (std::is_same_v<T, StringView>) { - RowSizeTracker tracker(group[rowSizeOffset_], stringAllocator_); - stringAllocator_.copyMultipart(group, offset); + RowSizeTracker tracker(group[rowSizeOffset_], *stringAllocator_); + stringAllocator_->copyMultipart(group, offset); } } @@ -1085,8 +1104,16 @@ class RowContainer { // Free any aggregates associated with the 'rows'. void freeAggregates(folly::Range<char**> rows); + const bool checkFree_ = false; + const std::vector<TypePtr> keyTypes_; const bool nullableKeys_; + const bool isJoinBuild_; + + // Indicates if we can add a new row to this row container. It is set to + // false after the user calls 'createRowPartitions()' to create the + // 'RowPartitions' object for parallel join build. + bool mutable_{true}; std::vector<Accumulator> accumulators_; @@ -1095,7 +1122,6 @@ class RowContainer { // to 'typeKinds_' and 'rowColumns_'. std::vector<TypePtr> types_; std::vector<TypeKind> typeKinds_; - const bool isJoinBuild_; int32_t nextOffset_ = 0; // Bit position of null bit in the row. 0 if no null flag. Order is keys, // accumulators, dependent. @@ -1134,10 +1160,7 @@ class RowContainer { uint64_t numFreeRows_ = 0; memory::AllocationPool rows_; - HashStringAllocator stringAllocator_; - - // Partition number for each row. Used only in parallel hash join build. - std::unique_ptr<RowPartitions> partitions_; + std::shared_ptr<HashStringAllocator> stringAllocator_; int alignment_ = 1; }; diff --git a/velox/exec/RowNumber.cpp b/velox/exec/RowNumber.cpp index 98c7f12179a5..f6da535982d9 100644 --- a/velox/exec/RowNumber.cpp +++ b/velox/exec/RowNumber.cpp @@ -27,7 +27,8 @@ RowNumber::RowNumber( operatorId, rowNumberNode->id(), "RowNumber"), - limit_{rowNumberNode->limit()} { + limit_{rowNumberNode->limit()}, + generateRowNumber_{rowNumberNode->generateRowNumber()} { const auto& inputType = rowNumberNode->sources()[0]->outputType(); const auto& keys = rowNumberNode->partitionKeys(); const auto numKeys = keys.size(); @@ -53,8 +54,10 @@ RowNumber::RowNumber( identityProjections_.emplace_back(i, i); } - resultProjections_.emplace_back(0, inputType->size()); - results_.resize(1); + if (generateRowNumber_) { + resultProjections_.emplace_back(0, inputType->size()); + results_.resize(1); + } } void RowNumber::addInput(RowVectorPtr input) { @@ -104,8 +107,11 @@ RowVectorPtr RowNumber::getOutput() { rawMapping = mapping->asMutable<vector_size_t>(); } - // Compute row numbers. - auto& rowNumbers = getOrCreateRowNumberVector(numInput); + // Compute row numbers if needed.
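+ // When 'generateRowNumber_' is false the operator still maintains the + // per-partition counts needed to enforce 'limit_'; it only skips + // materializing the row_number output column.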
+ FlatVector<int64_t>* rowNumbers = nullptr; + if (generateRowNumber_) { + rowNumbers = &getOrCreateRowNumberVector(numInput); + } for (auto i = 0; i < numInput; ++i) { auto* partition = lookup_->hits[i]; @@ -119,7 +125,9 @@ RowVectorPtr RowNumber::getOutput() { rawMapping[index++] = i; } - rowNumbers.set(i, rowNumber); + if (generateRowNumber_) { + rowNumbers->set(i, rowNumber); + } setNumRows(partition, rowNumber); } @@ -155,10 +163,11 @@ RowVectorPtr RowNumber::getOutputForSinglePartition() { numOutput = numInput; } - auto& rowNumbers = getOrCreateRowNumberVector(numOutput); - - for (auto i = 0; i < numOutput; ++i) { - rowNumbers.set(i, ++numTotalInput_); + if (generateRowNumber_) { + auto& rowNumbers = getOrCreateRowNumberVector(numOutput); + for (auto i = 0; i < numOutput; ++i) { + rowNumbers.set(i, ++numTotalInput_); + } } auto output = fillOutput(numOutput, nullptr); diff --git a/velox/exec/RowNumber.h b/velox/exec/RowNumber.h index f42ef6f97441..7ecf963b69cf 100644 --- a/velox/exec/RowNumber.h +++ b/velox/exec/RowNumber.h @@ -53,6 +53,7 @@ class RowNumber : public Operator { FlatVector<int64_t>& getOrCreateRowNumberVector(vector_size_t size); const std::optional<int32_t> limit_; + const bool generateRowNumber_; /// Hash table to store number of rows seen so far per partition. Not used if /// there are no partitioning keys. diff --git a/velox/exec/SetAccumulator.h b/velox/exec/SetAccumulator.h index bb373ba06ab6..48d81bc2ef66 100644 --- a/velox/exec/SetAccumulator.h +++ b/velox/exec/SetAccumulator.h @@ -89,7 +89,10 @@ struct SetAccumulator { return index - offset; } - void free(HashStringAllocator& allocator) {} + void free(HashStringAllocator& allocator) { + using UT = decltype(uniqueValues); + uniqueValues.~UT(); + } }; /// Maintains a set of unique strings. @@ -146,6 +149,8 @@ struct StringViewSetAccumulator { void free(HashStringAllocator& allocator) { strings.free(allocator); + using Base = decltype(base); + base.~Base(); } }; @@ -214,6 +219,8 @@ struct ComplexTypeSetAccumulator { void free(HashStringAllocator& allocator) { values.free(allocator); + using Base = decltype(base); + base.~Base(); } }; diff --git a/velox/exec/SortedAggregations.cpp b/velox/exec/SortedAggregations.cpp index b20537d3bd84..3bc43b460232 100644 --- a/velox/exec/SortedAggregations.cpp +++ b/velox/exec/SortedAggregations.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "velox/exec/SortedAggregations.h" +#include "velox/common/base/RawVector.h" namespace facebook::velox::exec { @@ -270,6 +271,7 @@ void SortedAggregations::extractValues( const RowVectorPtr& result) { // TODO Identify aggregates with same order by and sort once. + raw_vector<vector_size_t> temp; SelectivityVector rows; for (auto i = 0; i < aggregates_.size(); ++i) { const auto& aggregate = *aggregates_[i]; @@ -300,6 +302,11 @@ void SortedAggregations::extractValues( // Release memory back to HashStringAllocator to allow next aggregate to // re-use it. aggregate.function->destroy(groups); + // Initialize empty groups over the destroyed ones to keep the container + // in a well-formed state. + aggregate.function->initializeNewGroups( + groups.data(), + folly::Range(iota(groups.size(), temp), groups.size())); } } diff --git a/velox/exec/Spill.cpp b/velox/exec/Spill.cpp index 67b19d0bcb9c..253199891951 100644 --- a/velox/exec/Spill.cpp +++ b/velox/exec/Spill.cpp @@ -27,7 +27,9 @@ namespace facebook::velox::exec { // nanosecond precision, we use this serde option to ensure the serializer // preserves precision.
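+ // Compression is explicitly disabled for spill files by passing + // CompressionKind_NONE below.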
static const serializer::presto::PrestoVectorSerde::PrestoOptions - kDefaultSerdeOptions(/*useLosslessTimestamp*/ true); + kDefaultSerdeOptions( + /*useLosslessTimestamp*/ true, + common::CompressionKind::CompressionKind_NONE); std::atomic SpillFile::ordinalCounter_; diff --git a/velox/exec/StreamingAggregation.cpp b/velox/exec/StreamingAggregation.cpp index 0a5561ec5918..4d160839f91b 100644 --- a/velox/exec/StreamingAggregation.cpp +++ b/velox/exec/StreamingAggregation.cpp @@ -123,11 +123,7 @@ StreamingAggregation::StreamingAggregation( } void StreamingAggregation::close() { - for (int32_t i = 0; i < aggregates_.size(); ++i) { - if (aggregates_[i]->accumulatorUsesExternalMemory()) { - aggregates_[i]->destroy(folly::Range(groups_.data(), groups_.size())); - } - } + rows_->clear(); Operator::close(); } diff --git a/velox/exec/TaskStats.h b/velox/exec/TaskStats.h index b6aa2298a884..ac64dd4d769f 100644 --- a/velox/exec/TaskStats.h +++ b/velox/exec/TaskStats.h @@ -88,9 +88,9 @@ struct TaskStats { /// Output buffer's memory utilization ratio measured as /// current buffer usage / max buffer size - double outputBufferUtilization; + double outputBufferUtilization{0}; /// Indicates if output buffer is over-utilized and thus blocks the producers. - bool outputBufferOverutilized; + bool outputBufferOverutilized{false}; }; } // namespace facebook::velox::exec diff --git a/velox/exec/tests/AggregationFuzzer.cpp b/velox/exec/tests/AggregationFuzzer.cpp index 2e86d9fc079d..4f25173ff016 100644 --- a/velox/exec/tests/AggregationFuzzer.cpp +++ b/velox/exec/tests/AggregationFuzzer.cpp @@ -64,6 +64,11 @@ DEFINE_string( "Directory path for persistence of data and SQL when fuzzer fails for " "future reproduction. Empty string disables this feature."); +DEFINE_bool( + enable_window_duck_verification, + false, + "When true, the results of the window aggregation will be compared to duckdb results"); + DEFINE_bool( persist_and_run_once, false, @@ -164,7 +169,8 @@ class AggregationFuzzer { const std::vector& sortingKeys, const std::vector& aggregates, const std::vector& input, - bool customVerification); + bool customVerification, + bool enableWindowDuckVerification); std::optional computeDuckWindow( const std::vector& partitionKeys, @@ -586,7 +592,12 @@ void AggregationFuzzer::go() { auto input = generateInputDataWithRowNumber(argNames, argTypes); verifyWindow( - partitionKeys, sortingKeys, {call}, input, customVerification); + partitionKeys, + sortingKeys, + {call}, + input, + customVerification, + FLAGS_enable_window_duck_verification); } else { // 20% of times use mask. 
std::vector<std::string> masks; @@ -958,7 +969,8 @@ void AggregationFuzzer::verifyWindow( const std::vector& sortingKeys, const std::vector& aggregates, const std::vector& input, - bool customVerification) { + bool customVerification, + bool enableWindowDuckVerification) { std::stringstream frame; if (!partitionKeys.empty()) { frame << "partition by " << folly::join(", ", partitionKeys); @@ -981,7 +993,8 @@ void AggregationFuzzer::verifyWindow( ++stats_.numFailed; } - if (!customVerification && resultOrError.result) { + if (!customVerification && resultOrError.result && + enableWindowDuckVerification) { if (auto expectedResult = computeDuckWindow( partitionKeys, sortingKeys, aggregates, input, plan)) { ++stats_.numDuckVerified; diff --git a/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp b/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp index eb70babd1c4c..af7e725a9db6 100644 --- a/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp +++ b/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp @@ -149,22 +149,22 @@ class PartitionedOutputBufferManagerTest : public testing::Test { &receivedData]( std::vector<std::unique_ptr<folly::IOBuf>> pages, int64_t inSequence) { - EXPECT_FALSE(receivedData) << "for destination " << destination; - EXPECT_EQ(pages.size(), expectedGroups) + ASSERT_FALSE(receivedData) << "for destination " << destination; + ASSERT_EQ(pages.size(), expectedGroups) << "for destination " << destination; for (int i = 0; i < pages.size(); ++i) { if (i == pages.size() - 1) { - EXPECT_EQ(expectedEndMarker, pages[i] == nullptr) + ASSERT_EQ(expectedEndMarker, pages[i] == nullptr) << "for destination " << destination; } else { - EXPECT_TRUE(pages[i] != nullptr) + ASSERT_TRUE(pages[i] != nullptr) << "for destination " << destination; } } - EXPECT_EQ(inSequence, sequence) << "for destination " << destination; + ASSERT_EQ(inSequence, sequence) << "for destination " << destination; receivedData = true; })); - EXPECT_TRUE(receivedData) << "for destination " << destination; + ASSERT_TRUE(receivedData) << "for destination " << destination; } void fetchOne( @@ -562,20 +562,19 @@ TEST_F(PartitionedOutputBufferManagerTest, basicPartitioned) { taskId, rowType_, PartitionedOutputNode::Kind::kPartitioned, 5, 1); verifyOutputBuffer(task, OutputBufferStatus::kInitiated); - // Partitioned output buffer doesn't allow to update output buffers once - // created. - VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 + 1, true), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + // Duplicate updateOutputBuffers() calls with the same settings are allowed + // and ignored. + ASSERT_TRUE(bufferManager_->updateOutputBuffers(taskId, 5, true)); + ASSERT_FALSE(bufferManager_->isFinished(taskId)); + // Partitioned output buffer doesn't allow updates with a different number of + // output buffers once created. VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 + 1, false), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + bufferManager_->updateOutputBuffers(taskId, 5 + 1, true), ""); + // Partitioned output buffer doesn't expect more output buffers once created.
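+ // Repeating the same buffer count with noMoreBuffers == false below also + // throws: the buffer set of a partitioned output is fixed at creation time.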
+ VELOX_ASSERT_THROW(bufferManager_->updateOutputBuffers(taskId, 5, false), ""); VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 - 1, true), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + bufferManager_->updateOutputBuffers(taskId, 5 - 1, true), ""); VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 - 1, false), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + bufferManager_->updateOutputBuffers(taskId, 5 - 1, false), ""); // - enqueue one group per destination // - fetch and ask one group per destination @@ -761,8 +760,10 @@ TEST_F(PartitionedOutputBufferManagerTest, basicArbitrary) { fetchOneAndAck(taskId, numDestinations - 1, 0); ackedSeqbyDestination[numDestinations - 1] = 1; - bufferManager_->updateOutputBuffers(taskId, numDestinations, true); - VELOX_ASSERT_THROW(fetchOneAndAck(taskId, numDestinations, 0), ""); + bufferManager_->updateOutputBuffers(taskId, numDestinations - 1, false); + VELOX_ASSERT_THROW( + fetchOneAndAck(taskId, numDestinations - 1, 0), + "(0 vs. 1) Get received for an already acknowledged item"); receivedData = false; registerForData(taskId, numDestinations - 2, 0, 1, receivedData); @@ -772,13 +773,18 @@ TEST_F(PartitionedOutputBufferManagerTest, basicArbitrary) { ackedSeqbyDestination[numDestinations - 2] = 1; noMoreData(taskId); + EXPECT_FALSE(bufferManager_->isFinished(taskId)); EXPECT_TRUE(task->isRunning()); for (int i = 0; i < numDestinations; ++i) { fetchEndMarker(taskId, i, ackedSeqbyDestination[i]); } EXPECT_TRUE(bufferManager_->isFinished(taskId)); - EXPECT_FALSE(task->isRunning()); + + // NOTE: the arbitrary buffer's finish condition doesn't depend on the + // no-more-(destination)-buffers update flag. + bufferManager_->updateOutputBuffers(taskId, numDestinations, true); + EXPECT_TRUE(bufferManager_->isFinished(taskId)); bufferManager_->removeTask(taskId); EXPECT_TRUE(task->isFinished()); @@ -919,8 +925,7 @@ TEST_P(AllPartitionedOutputBufferManagerTest, outputBufferUtilization) { const auto destination = 0; auto task = initializeTask(taskId, rowType_, kind_, 1, 1); verifyOutputBuffer(task, OutputBufferStatus::kInitiated); - if (kind_ != - facebook::velox::core::PartitionedOutputNode::Kind::kPartitioned) { + if (kind_ == facebook::velox::core::PartitionedOutputNode::Kind::kBroadcast) { bufferManager_->updateOutputBuffers(taskId, destination, true); } diff --git a/velox/exec/tests/PlanNodeSerdeTest.cpp b/velox/exec/tests/PlanNodeSerdeTest.cpp index 5c6243bb6d82..c8a9ce7a84f7 100644 --- a/velox/exec/tests/PlanNodeSerdeTest.cpp +++ b/velox/exec/tests/PlanNodeSerdeTest.cpp @@ -413,6 +413,7 @@ TEST_F(PlanNodeSerdeTest, window) { } TEST_F(PlanNodeSerdeTest, rowNumber) { + // Test with emitting the row number. auto plan = PlanBuilder().values({data_}).rowNumber({}).planNode(); testSerde(plan); @@ -420,6 +421,26 @@ TEST_F(PlanNodeSerdeTest, rowNumber) { testSerde(plan); plan = PlanBuilder().values({data_}).rowNumber({"c1", "c2"}, 10).planNode(); + testSerde(plan); + + // Test without emitting the row number.
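+ // The third argument to rowNumber() below is 'generateRowNumber'; see the + // PlanBuilder changes later in this diff.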
+ plan = PlanBuilder() + .values({data_}) + .rowNumber({}, std::nullopt, false) + .planNode(); + testSerde(plan); + + plan = PlanBuilder() + .values({data_}) + .rowNumber({"c2", "c0"}, std::nullopt, false) + .planNode(); + testSerde(plan); + + plan = PlanBuilder() + .values({data_}) + .rowNumber({"c1", "c2"}, 10, false) + .planNode(); + testSerde(plan); } TEST_F(PlanNodeSerdeTest, scan) { diff --git a/velox/exec/tests/PlanNodeToStringTest.cpp b/velox/exec/tests/PlanNodeToStringTest.cpp index cd6e34fc0ec2..baa7b4743181 100644 --- a/velox/exec/tests/PlanNodeToStringTest.cpp +++ b/velox/exec/tests/PlanNodeToStringTest.cpp @@ -701,6 +701,7 @@ TEST_F(PlanNodeToStringTest, window) { } TEST_F(PlanNodeToStringTest, rowNumber) { + // Emit row number. auto plan = PlanBuilder().tableScan(ROW({"a"}, {VARCHAR()})).rowNumber({}).planNode(); ASSERT_EQ("-- RowNumber\n", plan->toString()); ASSERT_EQ( "-- RowNumber[] -> a:VARCHAR, row_number:BIGINT\n", plan->toString(true, false)); + // Don't emit row number. + plan = PlanBuilder() + .tableScan(ROW({"a"}, {VARCHAR()})) + .rowNumber({}, std::nullopt, false) + .planNode(); + + ASSERT_EQ("-- RowNumber\n", plan->toString()); + ASSERT_EQ("-- RowNumber[] -> a:VARCHAR\n", plan->toString(true, false)); + + // Emit row number. plan = PlanBuilder() .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) .rowNumber({"a", "b"}) @@ -719,6 +730,18 @@ TEST_F(PlanNodeToStringTest, rowNumber) { "-- RowNumber[partition by (a, b)] -> a:BIGINT, b:VARCHAR, row_number:BIGINT\n", plan->toString(true, false)); + // Don't emit row number. + plan = PlanBuilder() + .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) + .rowNumber({"a", "b"}, std::nullopt, false) + .planNode(); + + ASSERT_EQ("-- RowNumber\n", plan->toString()); + ASSERT_EQ( + "-- RowNumber[partition by (a, b)] -> a:BIGINT, b:VARCHAR\n", + plan->toString(true, false)); + + // Emit row number. plan = PlanBuilder() .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) .rowNumber({"b"}, 10) @@ -728,6 +751,17 @@ TEST_F(PlanNodeToStringTest, rowNumber) { ASSERT_EQ( "-- RowNumber[partition by (b) limit 10] -> a:BIGINT, b:VARCHAR, row_number:BIGINT\n", plan->toString(true, false)); + + // Don't emit row number. + plan = PlanBuilder() + .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) + .rowNumber({"b"}, 10, false) + .planNode(); + + ASSERT_EQ("-- RowNumber\n", plan->toString()); + ASSERT_EQ( + "-- RowNumber[partition by (b) limit 10] -> a:BIGINT, b:VARCHAR\n", + plan->toString(true, false)); } TEST_F(PlanNodeToStringTest, topNRowNumber) { diff --git a/velox/exec/tests/RowContainerTest.cpp b/velox/exec/tests/RowContainerTest.cpp index 1697e9efdbe2..868ac9427325 100644 --- a/velox/exec/tests/RowContainerTest.cpp +++ b/velox/exec/tests/RowContainerTest.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "velox/common/base/tests/GTestUtils.h" #include "velox/exec/Aggregate.h" #include "velox/exec/VectorHasher.h" #include "velox/exec/tests/utils/RowContainerTestBase.h" @@ -957,8 +958,8 @@ TEST_F(RowContainerTest, compareDouble) { } TEST_F(RowContainerTest, partition) { - // We assign an arbitrary partition number to each row and iterate - // over the rows a partition at a time. + // We assign an arbitrary partition number to each row and iterate over the + // rows a partition at a time.
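+ // The flow exercised below: createRowPartitions() freezes the container, + // appendPartitions() records a partition number per row, and + // listPartitionRows() then takes the RowPartitions explicitly.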
constexpr int32_t kNumRows = 100019; constexpr uint8_t kNumPartitions = 16; auto batch = makeDataset( @@ -986,7 +987,32 @@ TEST_F(RowContainerTest, partition) { } } - auto& partitions = data->partitions(); + // Expect listPartitionRows() to throw before row partitions are created + // from this row container. + for (auto partition = 0; partition < kNumPartitions; ++partition) { + char* dummyBuffer; + RowPartitions dummyRowPartitions(data->numRows(), *pool_); + VELOX_ASSERT_THROW( + data->listPartitionRows( + iter, + partition, + 1'000, /* maxRows */ + dummyRowPartitions, + &dummyBuffer), + "Can't list partition rows from a mutable row container"); + } + + auto partitions = data->createRowPartitions(*pool_); + ASSERT_FALSE(data->testingMutable()); + // Verify we can only get row partitions once from a row container. + VELOX_ASSERT_THROW( + data->createRowPartitions(*pool_), + "Can only create RowPartitions once from a row container"); + // Verify we can't insert a new row into an immutable row container. +#ifndef NDEBUG + VELOX_ASSERT_THROW( + data->newRow(), "Can't add row into an immutable row container"); +#endif + std::vector<uint8_t> rowPartitions(kNumRows); // Assign a partition to each row based on modulo of first column. std::vector<std::vector<char*>> partitionRows(kNumPartitions); @@ -997,7 +1023,7 @@ TEST_F(RowContainerTest, partition) { rowPartitions[i] = partition; partitionRows[partition].push_back(rows[i]); } - partitions.appendPartitions( + partitions->appendPartitions( folly::Range(rowPartitions.data(), kNumRows)); for (auto partition = 0; partition < kNumPartitions; ++partition) { std::vector<char*> result(partitionRows[partition].size() + 10); @@ -1006,7 +1032,11 @@ TEST_F(RowContainerTest, partition) { int32_t resultBatch = 1; // Read the rows in multiple batches. while (auto numResults = data->listPartitionRows( - iter, partition, resultBatch, result.data() + numFound)) { + iter, + partition, + resultBatch, + *partitions, + result.data() + numFound)) { numFound += numResults; resultBatch += 13; } @@ -1016,6 +1046,17 @@ TEST_F(RowContainerTest, partition) { } } +TEST_F(RowContainerTest, partitionWithEmptyRowContainer) { + auto rowType = ROW( + {{"int_val", INTEGER()}, + {"long_val", BIGINT()}, + {"string_val", VARCHAR()}}); + auto rowContainer = + std::make_unique<RowContainer>(rowType->children(), pool_.get()); + auto partitions = rowContainer->createRowPartitions(*pool_); + ASSERT_EQ(partitions->size(), 0); +} + TEST_F(RowContainerTest, probedFlag) { auto rowContainer = std::make_unique<RowContainer>( std::vector<TypePtr>{BIGINT()}, // keyTypes diff --git a/velox/exec/tests/RowNumberTest.cpp b/velox/exec/tests/RowNumberTest.cpp index 700cd260e78e..22fee689601b 100644 --- a/velox/exec/tests/RowNumberTest.cpp +++ b/velox/exec/tests/RowNumberTest.cpp @@ -28,11 +28,21 @@ TEST_F(RowNumberTest, basic) { createDuckDbTable({data}); - // No limit. + // No limit, emit row numbers. auto plan = PlanBuilder().values({data}).rowNumber({"c0"}).planNode(); assertQuery(plan, "SELECT *, row_number() over (partition by c0) FROM tmp"); + // No limit, don't emit row numbers. + plan = PlanBuilder() + .values({data}) + .rowNumber({"c0"}, std::nullopt, false) + .planNode(); + assertQuery( + plan, + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp)"); + auto testLimit = [&](int32_t limit) { + // Limit, emit row numbers.
auto plan = PlanBuilder().values({data}).rowNumber({"c0"}, limit).planNode(); assertQuery( @@ -41,6 +51,16 @@ TEST_F(RowNumberTest, basic) { "SELECT * FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " "WHERE rn <= {}", limit)); + + // Limit, don't emit row numbers. + plan = + PlanBuilder().values({data}).rowNumber({"c0"}, limit, false).planNode(); + assertQuery( + plan, + fmt::format( + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " + "WHERE rn <= {}", + limit)); }; testLimit(1); @@ -55,11 +75,20 @@ TEST_F(RowNumberTest, noPartitionKeys) { createDuckDbTable({data, data}); - // No limit. + // No limit, emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({}).planNode(); assertQuery(plan, "SELECT *, row_number() over () FROM tmp"); + // No limit, don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({}, std::nullopt, false) + .planNode(); + assertQuery( + plan, "SELECT c0 FROM (SELECT *, row_number() over () as rn FROM tmp)"); + auto testLimit = [&](int32_t limit) { + // Emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({}, limit).planNode(); assertQuery( @@ -68,6 +97,18 @@ TEST_F(RowNumberTest, noPartitionKeys) { "SELECT * FROM (SELECT *, row_number() over () as rn FROM tmp) " "WHERE rn <= {}", limit)); + + // Don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({}, limit, false) + .planNode(); + assertQuery( + plan, + fmt::format( + "SELECT c0 FROM (SELECT *, row_number() over () as rn FROM tmp) " + "WHERE rn <= {}", + limit)); }; testLimit(1); @@ -82,11 +123,21 @@ TEST_F(RowNumberTest, largeInput) { createDuckDbTable({data, data}); - // No limit. + // No limit, emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({"c0"}).planNode(); assertQuery(plan, "SELECT *, row_number() over (partition by c0) FROM tmp"); + // No limit, don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({"c0"}, std::nullopt, false) + .planNode(); + assertQuery( + plan, + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp)"); + auto testLimit = [&](int32_t limit) { + // Emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({"c0"}, limit).planNode(); assertQuery( @@ -95,6 +146,18 @@ TEST_F(RowNumberTest, largeInput) { "SELECT * FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " "WHERE rn <= {}", limit)); + + // Don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({"c0"}, limit, false) + .planNode(); + assertQuery( + plan, + fmt::format( + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " + "WHERE rn <= {}", + limit)); }; testLimit(1); diff --git a/velox/exec/tests/StreamingAggregationTest.cpp b/velox/exec/tests/StreamingAggregationTest.cpp index e880a9540a31..36858e523f87 100644 --- a/velox/exec/tests/StreamingAggregationTest.cpp +++ b/velox/exec/tests/StreamingAggregationTest.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/OperatorTestBase.h" #include "velox/exec/tests/utils/PlanBuilder.h" #include "velox/exec/tests/utils/SumNonPODAggregate.h" @@ -27,18 +28,6 @@ class StreamingAggregationTest : public OperatorTestBase { registerSumNonPODAggregate("sumnonpod", 64); } - CursorParameters makeCursorParameters( - const std::shared_ptr& planNode, - uint32_t preferredOutputBatchSize) { - CursorParameters params; - params.planNode = planNode; - params.queryCtx = std::make_shared(executor_.get()); - params.queryCtx->testingOverrideConfigUnsafe( - {{core::QueryConfig::kPreferredOutputBatchRows, - std::to_string(preferredOutputBatchSize)}}); - return params; - } - void testAggregation( const std::vector& keys, uint32_t outputBatchSize = 1'024) { @@ -67,11 +56,14 @@ class StreamingAggregationTest : public OperatorTestBase { .finalAggregation() .planNode(); - assertQuery( - makeCursorParameters(plan, outputBatchSize), - "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1)" - " , approx_quantile(c1, 0.95) " - "FROM tmp GROUP BY 1"); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults( + "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1)" + " , approx_quantile(c1, 0.95) " + "FROM tmp GROUP BY 1"); EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); @@ -85,9 +77,12 @@ class StreamingAggregationTest : public OperatorTestBase { .finalAggregation() .planNode(); - assertQuery( - makeCursorParameters(plan, outputBatchSize), - "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY 1"); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults( + "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY 1"); EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); @@ -103,11 +98,14 @@ class StreamingAggregationTest : public OperatorTestBase { .finalAggregation() .planNode(); - assertQuery( - makeCursorParameters(plan, outputBatchSize), - "SELECT c0, count(1), min(c1) filter (where c1 % 7 = 0), " - "max(c1) filter (where c1 % 11 = 0), sum(c1) filter (where c1 % 7 = 0) " - "FROM tmp GROUP BY 1"); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults( + "SELECT c0, count(1), min(c1) filter (where c1 % 7 = 0), " + "max(c1) filter (where c1 % 11 = 0), sum(c1) filter (where c1 % 7 = 0) " + "FROM tmp GROUP BY 1"); } std::vector addPayload(const std::vector& keys) { @@ -168,12 +166,23 @@ class StreamingAggregationTest : public OperatorTestBase { keySql << ", c" << i; } - assertQuery( - makeCursorParameters(plan, outputBatchSize), - fmt::format( - "SELECT {}, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY {}", - keySql.str(), - keySql.str())); + const auto sql = fmt::format( + "SELECT {}, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY {}", + keySql.str(), + keySql.str()); + + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults(sql); + + EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); + + // Force partial aggregation flush after every batch of input. 
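+ // A zero kMaxPartialAggregationMemory budget makes every input batch exceed + // the partial aggregation limit, which forces a flush per batch.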
+ AssertQueryBuilder(plan, duckDbQueryRunner_) + .config(core::QueryConfig::kMaxPartialAggregationMemory, "0") + .assertResults(sql); EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); } diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index e6df3e94b56c..78164dcdc3a8 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -2492,8 +2492,7 @@ TEST_F(TableScanTest, addSplitsToFailedTask) { } TEST_F(TableScanTest, errorInLoadLazy) { - auto cache = dynamic_cast( - memory::MemoryAllocator::getInstance()); + auto cache = cache::AsyncDataCache::getInstance(); VELOX_CHECK_NOT_NULL(cache); auto vectors = makeVectors(10, 1'000); auto filePath = TempFilePath::create(); diff --git a/velox/exec/tests/ThreadDebugInfoTest.cpp b/velox/exec/tests/ThreadDebugInfoTest.cpp index eea12376af1d..8721f5ee61ac 100644 --- a/velox/exec/tests/ThreadDebugInfoTest.cpp +++ b/velox/exec/tests/ThreadDebugInfoTest.cpp @@ -91,7 +91,7 @@ TEST_F(ThreadDebugInfoDeathTest, withinTheCallingThread) { executor_.get(), std::unordered_map{}, std::unordered_map>{}, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), nullptr, nullptr, "TaskCursorQuery_0"); diff --git a/velox/exec/tests/utils/AssertQueryBuilder.cpp b/velox/exec/tests/utils/AssertQueryBuilder.cpp index 74c52d72696b..0e3146eed0fb 100644 --- a/velox/exec/tests/utils/AssertQueryBuilder.cpp +++ b/velox/exec/tests/utils/AssertQueryBuilder.cpp @@ -216,7 +216,7 @@ AssertQueryBuilder::readCursor() { executor_.get(), std::unordered_map{}, std::unordered_map>{}, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), nullptr, nullptr, fmt::format("TaskCursorQuery_{}", cursorQueryId++)); diff --git a/velox/exec/tests/utils/Cursor.cpp b/velox/exec/tests/utils/Cursor.cpp index 59e0dad5f942..0495a15a3679 100644 --- a/velox/exec/tests/utils/Cursor.cpp +++ b/velox/exec/tests/utils/Cursor.cpp @@ -123,7 +123,7 @@ TaskCursor::TaskCursor(const CursorParameters& params) executor_.get(), std::unordered_map{}, std::unordered_map>{}, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), nullptr, nullptr, fmt::format("TaskCursorQuery_{}", cursorQueryId++)); diff --git a/velox/exec/tests/utils/OperatorTestBase.cpp b/velox/exec/tests/utils/OperatorTestBase.cpp index 8f378b70c361..a3f5e0ff4d06 100644 --- a/velox/exec/tests/utils/OperatorTestBase.cpp +++ b/velox/exec/tests/utils/OperatorTestBase.cpp @@ -33,9 +33,6 @@ using namespace facebook::velox::common::testutil; namespace facebook::velox::exec::test { -// static -std::shared_ptr OperatorTestBase::asyncDataCache_; - OperatorTestBase::OperatorTestBase() { using memory::MemoryAllocator; facebook::velox::exec::ExchangeSource::registerFactory(); @@ -51,29 +48,34 @@ OperatorTestBase::~OperatorTestBase() { memory::MemoryAllocator::setDefaultInstance(nullptr); } +void OperatorTestBase::SetUpTestCase() { + functions::prestosql::registerAllScalarFunctions(); + aggregate::prestosql::registerAllAggregateFunctions(); + TestValue::enable(); +} + void OperatorTestBase::TearDownTestCase() { Task::testingWaitForAllTasksToBeDeleted(); } void OperatorTestBase::SetUp() { - // Sets the process default MemoryAllocator to an async cache of up - // to 4GB backed by a default MemoryAllocator - if (!asyncDataCache_) { - asyncDataCache_ = std::make_shared( - memory::MemoryAllocator::createDefaultInstance(), 4UL << 30); - } - memory::MemoryAllocator::setDefaultInstance(asyncDataCache_.get()); if 
(!isRegisteredVectorSerde()) { this->registerVectorSerde(); } driverExecutor_ = std::make_unique(3); ioExecutor_ = std::make_unique(3); + allocator_ = memory::MemoryAllocator::createDefaultInstance(); + if (!asyncDataCache_) { + asyncDataCache_ = cache::AsyncDataCache::create(allocator_.get()); + cache::AsyncDataCache::setInstance(asyncDataCache_.get()); + } + memory::MemoryAllocator::setDefaultInstance(allocator_.get()); } -void OperatorTestBase::SetUpTestCase() { - functions::prestosql::registerAllScalarFunctions(); - aggregate::prestosql::registerAllAggregateFunctions(); - TestValue::enable(); +void OperatorTestBase::TearDown() { + if (asyncDataCache_ != nullptr) { + asyncDataCache_->prepareShutdown(); + } } std::shared_ptr OperatorTestBase::assertQuery( diff --git a/velox/exec/tests/utils/OperatorTestBase.h b/velox/exec/tests/utils/OperatorTestBase.h index e5a40d3f8f4e..7900b94f95aa 100644 --- a/velox/exec/tests/utils/OperatorTestBase.h +++ b/velox/exec/tests/utils/OperatorTestBase.h @@ -37,6 +37,8 @@ class OperatorTestBase : public testing::Test, void SetUp() override; + void TearDown() override; + /// Allow base classes to register custom vector serde. /// By default, registers Presto-compatible serde. virtual void registerVectorSerde(); @@ -139,8 +141,11 @@ class OperatorTestBase : public testing::Test, protected: DuckDbQueryRunner duckDbQueryRunner_; - // Used as default MappedMemory. Created on first use. - static std::shared_ptr asyncDataCache_; + // Used as default MemoryAllocator. + std::shared_ptr allocator_; + + // Used as default AsyncDataCache. + std::shared_ptr asyncDataCache_; // Used for driver thread execution. std::unique_ptr driverExecutor_; diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 39c94faa9ef7..0f3ce0418805 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -848,9 +848,9 @@ PlanBuilder& PlanBuilder::partitionedOutput( : extract(planNode_->outputType(), outputLayout); planNode_ = std::make_shared( nextPlanNodeId(), + core::PartitionedOutputNode::Kind::kPartitioned, exprs(keys), numPartitions, - core::PartitionedOutputNode::Kind::kPartitioned, replicateNullsAndAny, std::move(partitionFunctionSpec), outputType, @@ -1424,9 +1424,18 @@ PlanBuilder& PlanBuilder::window( PlanBuilder& PlanBuilder::rowNumber( const std::vector& partitionKeys, - std::optional limit) { + std::optional limit, + const bool generateRowNumber) { + std::optional rowNumberColumnName; + if (generateRowNumber) { + rowNumberColumnName = "row_number"; + } planNode_ = std::make_shared( - nextPlanNodeId(), fields(partitionKeys), "row_number", limit, planNode_); + nextPlanNodeId(), + fields(partitionKeys), + rowNumberColumnName, + limit, + planNode_); return *this; } diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index b12b812ca3d1..99b347799f80 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -722,7 +722,8 @@ class PlanBuilder { /// optional limit and no sorting. PlanBuilder& rowNumber( const std::vector& partitionKeys, - std::optional limit = std::nullopt); + std::optional limit = std::nullopt, + bool generateRowNumber = true); /// Add a TopNRowNumberNode to compute single row_number window function with /// a limit applied to sorted partitions. 
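For reference, a minimal sketch of the updated builder call (the 'data' setup is assumed, not part of this diff): a RowNumber node that enforces a per-partition limit without materializing the row_number column:

    auto plan = PlanBuilder()
        .values({data})
        .rowNumber({"c0"}, /*limit=*/10, /*generateRowNumber=*/false)
        .planNode();

When generateRowNumber is false, PlanBuilder passes std::nullopt for the row number column name, which RowNumberNode::generateRowNumber() then reports to the operator.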
diff --git a/velox/exec/tests/utils/RowContainerTestBase.h b/velox/exec/tests/utils/RowContainerTestBase.h index 1b6f5940060a..2d70c5fbf6cf 100644 --- a/velox/exec/tests/utils/RowContainerTestBase.h +++ b/velox/exec/tests/utils/RowContainerTestBase.h @@ -54,7 +54,7 @@ class RowContainerTestBase : public testing::Test, const std::vector& keyTypes, const std::vector& dependentTypes, bool isJoinBuild = true) { - return std::make_unique( + auto container = std::make_unique( keyTypes, !isJoinBuild, std::vector{}, @@ -64,6 +64,8 @@ class RowContainerTestBase : public testing::Test, true, true, pool_.get()); + VELOX_CHECK(container->testingMutable()); + return container; } }; } // namespace facebook::velox::exec::test diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h new file mode 100644 index 000000000000..6883aee47a0b --- /dev/null +++ b/velox/expression/CastExpr-inl.h @@ -0,0 +1,322 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "velox/common/base/Exceptions.h" +#include "velox/core/CoreTypeSystem.h" +#include "velox/expression/StringWriter.h" +#include "velox/external/date/tz.h" +#include "velox/type/Type.h" +#include "velox/vector/SelectivityVector.h" + +namespace facebook::velox::exec { +namespace { + +inline std::string makeErrorMessage( + const BaseVector& input, + vector_size_t row, + const TypePtr& toType, + const std::string& details = "") { + return fmt::format( + "Failed to cast from {} to {}: {}. {}", + input.type()->toString(), + toType->toString(), + input.toString(row), + details); +} + +inline std::exception_ptr makeBadCastException( + const TypePtr& resultType, + const BaseVector& input, + vector_size_t row, + const std::string& errorDetails) { + return std::make_exception_ptr(VeloxUserError( + std::current_exception(), + makeErrorMessage(input, row, resultType, errorDetails), + false)); +}; + +} // namespace + +template +void CastExpr::castTimestampToDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + VectorPtr& result, + const date::time_zone* timeZone) { + auto* resultFlatVector = result->as>(); + static const int32_t kSecsPerDay{86'400}; + auto inputVector = input.as>(); + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + auto input = inputVector->valueAt(row); + if constexpr (adjustForTimeZone) { + input.toTimezone(*timeZone); + } + auto seconds = input.getSeconds(); + if (seconds >= 0 || seconds % kSecsPerDay == 0) { + resultFlatVector->set(row, seconds / kSecsPerDay); + } else { + // For division with negatives, minus 1 to compensate the discarded + // fractional part. e.g. -1/86'400 yields 0, yet it should be + // considered as -1 day. 
+ resultFlatVector->set(row, seconds / kSecsPerDay - 1); + } + }); +} + +template +void CastExpr::applyToSelectedNoThrowLocal( + EvalCtx& context, + const SelectivityVector& rows, + VectorPtr& result, + Func&& func) { + if (setNullInResultAtError()) { + rows.template applyToSelected([&](auto row) INLINE_LAMBDA { + try { + func(row); + } catch (...) { + result->setNull(row, true); + } + }); + } else { + rows.template applyToSelected([&](auto row) INLINE_LAMBDA { + try { + func(row); + } catch (const VeloxException& e) { + // Avoid double throwing. + context.setVeloxExceptionError(row, std::current_exception()); + } catch (const std::exception& e) { + context.setError(row, std::current_exception()); + } + }); + } +} + +/// The per-row level Kernel +/// @tparam ToKind The cast target type +/// @tparam FromKind The expression type +/// @param row The index of the current row +/// @param input The input vector (of type FromKind) +/// @param result The output vector (of type ToKind) +template +void CastExpr::applyCastKernel( + vector_size_t row, + EvalCtx& context, + const SimpleVector::NativeType>* input, + FlatVector::NativeType>* result) { + auto inputRowValue = input->valueAt(row); + + // Optimize empty input strings casting by avoiding throwing exceptions. + if constexpr ( + FromKind == TypeKind::VARCHAR || FromKind == TypeKind::VARBINARY) { + if constexpr ( + TypeTraits::isPrimitiveType && + TypeTraits::isFixedWidth) { + if (inputRowValue.size() == 0) { + if (setNullInResultAtError()) { + result->setNull(row, true); + } else { + context.setVeloxExceptionError( + row, + makeBadCastException( + result->type(), *input, row, "Empty string")); + } + return; + } + } + } + + auto output = util::Converter::cast(inputRowValue); + + if constexpr (ToKind == TypeKind::VARCHAR || ToKind == TypeKind::VARBINARY) { + // Write the result output to the output vector + auto writer = exec::StringWriter<>(result, row); + writer.copy_from(output); + writer.finalize(); + } else { + result->set(row, output); + } +} + +template +void CastExpr::applyDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType, + const TypePtr& toType, + VectorPtr& castResult) { + auto sourceVector = input.as>(); + auto castResultRawBuffer = + castResult->asUnchecked>()->mutableRawValues(); + const auto& fromPrecisionScale = getDecimalPrecisionScale(*fromType); + const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); + + applyToSelectedNoThrowLocal( + context, rows, castResult, [&](vector_size_t row) { + auto rescaledValue = DecimalUtil::rescaleWithRoundUp( + sourceVector->valueAt(row), + fromPrecisionScale.first, + fromPrecisionScale.second, + toPrecisionScale.first, + toPrecisionScale.second); + if (rescaledValue.has_value()) { + castResultRawBuffer[row] = rescaledValue.value(); + } else { + castResult->setNull(row, true); + } + }); +} + +template +void CastExpr::applyIntToDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& toType, + VectorPtr& castResult) { + auto sourceVector = input.as>(); + auto castResultRawBuffer = + castResult->asUnchecked>()->mutableRawValues(); + const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); + applyToSelectedNoThrowLocal( + context, rows, castResult, [&](vector_size_t row) { + auto rescaledValue = DecimalUtil::rescaleInt( + sourceVector->valueAt(row), + toPrecisionScale.first, + toPrecisionScale.second); + if (rescaledValue.has_value()) 
{ + castResultRawBuffer[row] = rescaledValue.value(); + } else { + castResult->setNull(row, true); + } + }); +} + +template +VectorPtr CastExpr::applyDecimalToDoubleCast( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType) { + VectorPtr result; + context.ensureWritable(rows, DOUBLE(), result); + (*result).clearNulls(rows); + auto resultBuffer = + result->asUnchecked>()->mutableRawValues(); + const auto precisionScale = getDecimalPrecisionScale(*fromType); + const auto simpleInput = input.as>(); + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + auto output = util::Converter::cast( + simpleInput->valueAt(row)); + resultBuffer[row] = + output / DecimalUtil::kPowersOfTen[precisionScale.second]; + }); + + return result; +} + +template +void CastExpr::applyCastPrimitives( + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result) { + using To = typename TypeTraits::NativeType; + using From = typename TypeTraits::NativeType; + auto* resultFlatVector = result->as>(); + auto* inputSimpleVector = input.as>(); + + const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); + auto& resultType = resultFlatVector->type(); + + auto setError = [&](vector_size_t row, const std::string& details) { + if (setNullInResultAtError()) { + result->setNull(row, true); + } else { + context.setVeloxExceptionError( + row, makeBadCastException(resultType, input, row, details)); + } + }; + + if (!queryConfig.isCastToIntByTruncate()) { + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + try { + applyCastKernel( + row, context, inputSimpleVector, resultFlatVector); + + } catch (const VeloxUserError& ue) { + setError(row, ue.message()); + } catch (const std::exception& e) { + setError(row, e.what()); + } + }); + + } else { + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + try { + applyCastKernel( + row, context, inputSimpleVector, resultFlatVector); + } catch (const VeloxUserError& ue) { + setError(row, ue.message()); + } catch (const std::exception& e) { + setError(row, e.what()); + } + }); + } + + // If we're converting to a TIMESTAMP, check if we need to adjust the + // current GMT timezone to the user provided session timezone. + if constexpr (ToKind == TypeKind::TIMESTAMP) { + // If user explicitly asked us to adjust the timezone. + if (queryConfig.adjustTimestampToTimezone()) { + auto sessionTzName = queryConfig.sessionTimezone(); + if (!sessionTzName.empty()) { + // locate_zone throws runtime_error if the timezone couldn't be found + // (so we're safe to dereference the pointer). + auto* timeZone = date::locate_zone(sessionTzName); + auto rawTimestamps = resultFlatVector->mutableRawValues(); + + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + rawTimestamps[row].toGMT(*timeZone); + }); + } + } + } +} + +template +void CastExpr::applyCastPrimitivesDispatch( + const TypePtr& fromType, + const TypePtr& toType, + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result) { + context.ensureWritable(rows, toType, result); + + // This already excludes complex types, hugeint and unknown from type kinds. 
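+ // The dispatch macro switches on 'fromType->kind()' and invokes + // applyCastPrimitives<ToKind, FromKind> for the matching scalar source kind.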
+ VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( + applyCastPrimitives, + ToKind, + fromType->kind() /*dispatched*/, + rows, + context, + input, + result); +} + +} // namespace facebook::velox::exec diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp index 2ff3ceb5b187..68ad19ab4206 100644 --- a/velox/expression/CastExpr.cpp +++ b/velox/expression/CastExpr.cpp @@ -16,15 +16,13 @@ #include "velox/expression/CastExpr.h" -#include - #include +#include -#include #include "velox/common/base/Exceptions.h" #include "velox/core/CoreTypeSystem.h" #include "velox/expression/PeeledEncoding.h" -#include "velox/expression/StringWriter.h" +#include "velox/expression/ScopedVarSetter.h" #include "velox/external/date/tz.h" #include "velox/functions/lib/RowsTranslationUtil.h" #include "velox/type/Type.h" @@ -34,74 +32,7 @@ namespace facebook::velox::exec { -namespace { - -std::string makeErrorMessage( - const BaseVector& input, - vector_size_t row, - const TypePtr& toType, - const std::string& details = "") { - return fmt::format( - "Failed to cast from {} to {}: {}. {}", - input.type()->toString(), - toType->toString(), - input.toString(row), - details); -} - -std::exception_ptr makeBadCastException( - const TypePtr& resultType, - const BaseVector& input, - vector_size_t row, - const std::string& errorDetails) { - return std::make_exception_ptr(VeloxUserError( - std::current_exception(), - makeErrorMessage(input, row, resultType, errorDetails), - false)); -}; - -/// The per-row level Kernel -/// @tparam ToKind The cast target type -/// @tparam FromKind The expression type -/// @param row The index of the current row -/// @param input The input vector (of type FromKind) -/// @param result The output vector (of type ToKind) -template -void applyCastKernel( - vector_size_t row, - EvalCtx& context, - const SimpleVector::NativeType>* input, - FlatVector::NativeType>* result) { - auto inputRowValue = input->valueAt(row); - - // Optimize empty input strings casting by avoiding throwing exceptions. 
- if constexpr ( - FromKind == TypeKind::VARCHAR || FromKind == TypeKind::VARBINARY) { - if constexpr ( - TypeTraits::isPrimitiveType && - TypeTraits::isFixedWidth) { - if (inputRowValue.size() == 0) { - context.setVeloxExceptionError( - row, - makeBadCastException(result->type(), *input, row, "Empty string")); - return; - } - } - } - - auto output = util::Converter::cast(inputRowValue); - - if constexpr (ToKind == TypeKind::VARCHAR || ToKind == TypeKind::VARBINARY) { - // Write the result output to the output vector - auto writer = exec::StringWriter<>(result, row); - writer.copy_from(output); - writer.finalize(); - } else { - result->set(row, output); - } -} - -VectorPtr castFromDate( +VectorPtr CastExpr::castFromDate( const SelectivityVector& rows, const BaseVector& input, exec::EvalCtx& context, @@ -114,7 +45,7 @@ VectorPtr castFromDate( switch (toType->kind()) { case TypeKind::VARCHAR: { auto* resultFlatVector = castResult->as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { + applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { try { auto output = DATE()->toString(inputFlatVector->valueAt(row)); auto writer = exec::StringWriter<>(resultFlatVector, row); @@ -134,12 +65,13 @@ VectorPtr castFromDate( case TypeKind::TIMESTAMP: { static const int64_t kMillisPerDay{86'400'000}; auto* resultFlatVector = castResult->as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { + applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { resultFlatVector->set( row, Timestamp::fromMillis( inputFlatVector->valueAt(row) * kMillisPerDay)); }); + return castResult; } default: @@ -148,33 +80,7 @@ VectorPtr castFromDate( } } -template -void castTimestampToDate( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - FlatVector* resultFlatVector, - const date::time_zone* timeZone = nullptr) { - static const int32_t kSecsPerDay{86'400}; - auto inputVector = input.as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { - auto input = inputVector->valueAt(row); - if constexpr (adjustForTimeZone) { - input.toTimezone(*timeZone); - } - auto seconds = input.getSeconds(); - if (seconds >= 0 || seconds % kSecsPerDay == 0) { - resultFlatVector->set(row, seconds / kSecsPerDay); - } else { - // For division with negatives, minus 1 to compensate the discarded - // fractional part. e.g. -1/86'400 yields 0, yet it should be - // considered as -1 day. 
- resultFlatVector->set(row, seconds / kSecsPerDay - 1); - } - }); -} - -VectorPtr castToDate( +VectorPtr CastExpr::castToDate( const SelectivityVector& rows, const BaseVector& input, exec::EvalCtx& context, @@ -186,7 +92,7 @@ VectorPtr castToDate( switch (fromType->kind()) { case TypeKind::VARCHAR: { auto* inputVector = input.as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { + applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { try { auto inputString = inputVector->valueAt(row); resultFlatVector->set(row, DATE()->toDays(inputString)); @@ -198,6 +104,7 @@ VectorPtr castToDate( makeErrorMessage(input, row, DATE()) + " " + e.what()); } }); + return castResult; } case TypeKind::TIMESTAMP: { @@ -205,10 +112,9 @@ VectorPtr castToDate( auto sessionTzName = queryConfig.sessionTimezone(); if (queryConfig.adjustTimestampToTimezone() && !sessionTzName.empty()) { auto* timeZone = date::locate_zone(sessionTzName); - castTimestampToDate( - rows, input, context, resultFlatVector, timeZone); + castTimestampToDate(rows, input, context, castResult, timeZone); } else { - castTimestampToDate(rows, input, context, resultFlatVector); + castTimestampToDate(rows, input, context, castResult); } return castResult; } @@ -218,169 +124,26 @@ VectorPtr castToDate( } } -template -void applyDecimalCastKernel( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - const TypePtr& fromType, - const TypePtr& toType, - VectorPtr& castResult) { - auto sourceVector = input.as>(); - auto castResultRawBuffer = - castResult->asUnchecked>()->mutableRawValues(); - const auto& fromPrecisionScale = getDecimalPrecisionScale(*fromType); - const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); - context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { - auto rescaledValue = DecimalUtil::rescaleWithRoundUp( - sourceVector->valueAt(row), - fromPrecisionScale.first, - fromPrecisionScale.second, - toPrecisionScale.first, - toPrecisionScale.second); - if (rescaledValue.has_value()) { - castResultRawBuffer[row] = rescaledValue.value(); - } else { - castResult->setNull(row, true); - } - }); -} - -template -void applyIntToDecimalCastKernel( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - const TypePtr& toType, - VectorPtr& castResult) { - auto sourceVector = input.as>(); - auto castResultRawBuffer = - castResult->asUnchecked>()->mutableRawValues(); - const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); - context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { - auto rescaledValue = DecimalUtil::rescaleInt( - sourceVector->valueAt(row), - toPrecisionScale.first, - toPrecisionScale.second); - if (rescaledValue.has_value()) { - castResultRawBuffer[row] = rescaledValue.value(); +namespace { +void propagateErrorsOrSetNulls( + bool setNullInResultAtError, + EvalCtx& context, + const SelectivityVector& nestedRows, + const BufferPtr& elementToTopLevelRows, + VectorPtr& result, + ErrorVectorPtr& oldErrors) { + if (context.errors()) { + if (setNullInResultAtError) { + // Errors in context.errors() should be translated to nulls in the top + // level rows. 
+ context.convertElementErrorsToTopLevelNulls( + nestedRows, elementToTopLevelRows, result); } else { - castResult->setNull(row, true); - } - }); -} - -template -VectorPtr applyDecimalToDoubleCast( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - const TypePtr& fromType) { - VectorPtr result; - context.ensureWritable(rows, DOUBLE(), result); - (*result).clearNulls(rows); - auto resultBuffer = - result->asUnchecked>()->mutableRawValues(); - const auto precisionScale = getDecimalPrecisionScale(*fromType); - const auto simpleInput = input.as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { - auto output = util::Converter::cast( - simpleInput->valueAt(row)); - resultBuffer[row] = - output / DecimalUtil::kPowersOfTen[precisionScale.second]; - }); - return result; -} - -template -void applyCastPrimitives( - const SelectivityVector& rows, - exec::EvalCtx& context, - const BaseVector& input, - VectorPtr& result) { - using To = typename TypeTraits::NativeType; - using From = typename TypeTraits::NativeType; - auto* resultFlatVector = result->as>(); - auto* inputSimpleVector = input.as>(); - - const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); - auto& resultType = resultFlatVector->type(); - - auto setVeloxError = [&](vector_size_t row, const std::string& details) { - context.setVeloxExceptionError( - row, makeBadCastException(resultType, input, row, details)); - }; - - auto setError = [&](vector_size_t row, const std::string& details) { - context.setError( - row, makeBadCastException(resultType, input, row, details)); - }; - - if (!queryConfig.isCastToIntByTruncate()) { - context.applyToSelectedNoThrow(rows, [&](int row) { - try { - applyCastKernel( - row, context, inputSimpleVector, resultFlatVector); - - } catch (const VeloxUserError& ue) { - setVeloxError(row, ue.message()); - } catch (const std::exception& e) { - setError(row, e.what()); - } - }); - } else { - context.applyToSelectedNoThrow(rows, [&](int row) { - try { - applyCastKernel( - row, context, inputSimpleVector, resultFlatVector); - } catch (const VeloxUserError& ue) { - setVeloxError(row, ue.message()); - } catch (const std::exception& e) { - setError(row, e.what()); - } - }); - } - - // If we're converting to a TIMESTAMP, check if we need to adjust the - // current GMT timezone to the user provided session timezone. - if constexpr (ToKind == TypeKind::TIMESTAMP) { - // If user explicitly asked us to adjust the timezone. - if (queryConfig.adjustTimestampToTimezone()) { - auto sessionTzName = queryConfig.sessionTimezone(); - if (!sessionTzName.empty()) { - // locate_zone throws runtime_error if the timezone couldn't be found - // (so we're safe to dereference the pointer). - auto* timeZone = date::locate_zone(sessionTzName); - auto rawTimestamps = resultFlatVector->mutableRawValues(); - - rows.applyToSelected( - [&](int row) { rawTimestamps[row].toGMT(*timeZone); }); - } + context.addElementErrorsToTopLevel( + nestedRows, elementToTopLevelRows, oldErrors); } } } - -template -void applyCastPrimitivesDispatch( - const TypePtr& fromType, - const TypePtr& toType, - const SelectivityVector& rows, - exec::EvalCtx& context, - const BaseVector& input, - VectorPtr& result) { - context.ensureWritable(rows, toType, result); - - // This already excludes complex types, hugeint and unknown from type kinds. 
- VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( - applyCastPrimitives, - ToKind, - fromType->kind() /*dispatched*/, - rows, - context, - input, - result); -} - } // namespace VectorPtr CastExpr::applyMap( @@ -413,13 +176,16 @@ VectorPtr CastExpr::applyMap( if (fromType.keyType() == toType.keyType()) { newMapKeys = input->mapKeys(); } else { - apply( - nestedRows, - mapKeys, - context, - fromType.keyType(), - toType.keyType(), - newMapKeys); + { + ScopedVarSetter holder(&inTopLevel, false); + apply( + nestedRows, + mapKeys, + context, + fromType.keyType(), + toType.keyType(), + newMapKeys); + } } // Cast values @@ -427,19 +193,18 @@ VectorPtr CastExpr::applyMap( if (fromType.valueType() == toType.valueType()) { newMapValues = mapValues; } else { - apply( - nestedRows, - mapValues, - context, - fromType.valueType(), - toType.valueType(), - newMapValues); + { + ScopedVarSetter holder(&inTopLevel, false); + apply( + nestedRows, + mapValues, + context, + fromType.valueType(), + toType.valueType(), + newMapValues); + } } - context.addElementErrorsToTopLevel( - nestedRows, elementToTopLevelRows, oldErrors); - context.swapErrors(oldErrors); - // Returned map vector should be addressable for every element, even those // that are not selected. BufferPtr sizes = input->sizes(); @@ -447,7 +212,6 @@ VectorPtr CastExpr::applyMap( // We extends size since that is cheap. newMapKeys->resize(input->mapKeys()->size()); newMapValues->resize(input->mapValues()->size()); - } else if ( newMapKeys->size() < input->mapKeys()->size() || newMapValues->size() < input->mapValues()->size()) { @@ -455,12 +219,13 @@ VectorPtr CastExpr::applyMap( AlignedBuffer::allocate(rows.end(), context.pool(), 0); auto* inputSizes = input->rawSizes(); auto* rawSizes = sizes->asMutable(); + rows.applyToSelected( [&](vector_size_t row) { rawSizes[row] = inputSizes[row]; }); } // Assemble the output map - return std::make_shared( + VectorPtr result = std::make_shared( context.pool(), MAP(toType.keyType(), toType.valueType()), input->nulls(), @@ -469,6 +234,18 @@ VectorPtr CastExpr::applyMap( sizes, newMapKeys, newMapValues); + + propagateErrorsOrSetNulls( + setNullInResultAtError(), + context, + nestedRows, + elementToTopLevelRows, + result, + oldErrors); + + // Restore original state. + context.swapErrors(oldErrors); + return result; } VectorPtr CastExpr::applyArray( @@ -490,19 +267,16 @@ VectorPtr CastExpr::applyArray( context.swapErrors(oldErrors); VectorPtr newElements; - apply( - nestedRows, - arrayElements, - context, - fromType.elementType(), - toType.elementType(), - newElements); - - if (context.errors()) { - context.addElementErrorsToTopLevel( - nestedRows, elementToTopLevelRows, oldErrors); + { + ScopedVarSetter holder(&inTopLevel, false); + apply( + nestedRows, + arrayElements, + context, + fromType.elementType(), + toType.elementType(), + newElements); } - context.swapErrors(oldErrors); // Returned array vector should be addressable for every element, even those // that are not selected. @@ -519,7 +293,7 @@ VectorPtr CastExpr::applyArray( [&](vector_size_t row) { rawSizes[row] = inputSizes[row]; }); } - return std::make_shared( + VectorPtr result = std::make_shared( context.pool(), ARRAY(toType.elementType()), input->nulls(), @@ -527,6 +301,17 @@ VectorPtr CastExpr::applyArray( input->offsets(), sizes, newElements); + + propagateErrorsOrSetNulls( + setNullInResultAtError(), + context, + nestedRows, + elementToTopLevelRows, + result, + oldErrors); + // Restore original state. 
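+  // 'oldErrors' holds the errors that existed before this cast ran; swapping
+  // them back ensures pre-existing errors are neither lost nor mistaken for
+  // cast failures.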
+  context.swapErrors(oldErrors);
+  return result;
 }
 
 VectorPtr CastExpr::applyRow(
@@ -548,6 +333,13 @@ VectorPtr CastExpr::applyRow(
   std::vector<VectorPtr> newChildren;
   newChildren.reserve(numOutputChildren);
 
+  ErrorVectorPtr oldErrors;
+  if (setNullInResultAtError()) {
+    // We need to isolate errors that happen during the cast from previous
+    // errors since those translate to nulls, unlike existing errors.
+    context.swapErrors(oldErrors);
+  }
+
   for (auto toChildrenIndex = 0; toChildrenIndex < numOutputChildren;
        toChildrenIndex++) {
     // For each child, find the corresponding column index in the output
@@ -584,7 +376,8 @@
       if (toChildType == inputChild->type()) {
         outputChild = inputChild;
       } else {
-        // Apply cast for the child
+        // Apply cast for the child.
+        ScopedVarSetter holder(&inTopLevel, false);
         apply(
             rows,
             inputChild,
@@ -598,12 +391,27 @@
   }
 
   // Assemble the output row
-  return std::make_shared<RowVector>(
+  VectorPtr result = std::make_shared<RowVector>(
      context.pool(),
      toType,
      input->nulls(),
      rows.end(),
      std::move(newChildren));
+
+  if (setNullInResultAtError()) {
+    // Set errors as nulls.
+    if (auto errors = context.errors()) {
+      rows.applyToSelected([&](auto row) {
+        if (errors->isIndexInRange(row) && !errors->isNullAt(row)) {
+          result->setNull(row, true);
+        }
+      });
+    }
+    // Restore original state.
+    context.swapErrors(oldErrors);
+  }
+
+  return result;
 }
 
 template
@@ -797,14 +605,20 @@ void CastExpr::evalSpecialForm(
   auto fromType = inputs_[0]->type();
   auto toType = std::const_pointer_cast(type_);
 
-  apply(rows, input, context, fromType, toType, result);
+  inTopLevel = true;
+  if (nullOnFailure()) {
+    ScopedVarSetter holder{context.mutableThrowOnError(), false};
+    apply(rows, input, context, fromType, toType, result);
+  } else {
+    apply(rows, input, context, fromType, toType, result);
+  }
 
   // Return 'input' back to the vector pool in 'context' so it can be reused.
context.releaseVector(input); } std::string CastExpr::toString(bool recursive) const { std::stringstream out; - out << "cast("; + out << name() << "("; if (recursive) { appendInputs(out); } else { @@ -816,7 +630,7 @@ std::string CastExpr::toString(bool recursive) const { std::string CastExpr::toSql(std::vector* complexConstants) const { std::stringstream out; - out << "cast("; + out << name() << "("; appendInputsSql(out, complexConstants); out << " as "; toTypeSql(type_, out); @@ -839,6 +653,24 @@ ExprPtr CastCallToSpecialForm::constructSpecialForm( "CAST statements expect exactly 1 argument, received {}", compiledChildren.size()); return std::make_shared( - type, std::move(compiledChildren[0]), trackCpuUsage); + type, std::move(compiledChildren[0]), trackCpuUsage, false); +} + +TypePtr TryCastCallToSpecialForm::resolveType( + const std::vector& /* argTypes */) { + VELOX_FAIL("TRY CAST expressions do not support type resolution."); +} + +ExprPtr TryCastCallToSpecialForm::constructSpecialForm( + const TypePtr& type, + std::vector&& compiledChildren, + bool trackCpuUsage) { + VELOX_CHECK_EQ( + compiledChildren.size(), + 1, + "TRY CAST statements expect exactly 1 argument, received {}", + compiledChildren.size()); + return std::make_shared( + type, std::move(compiledChildren[0]), trackCpuUsage, true); } } // namespace facebook::velox::exec diff --git a/velox/expression/CastExpr.h b/velox/expression/CastExpr.h index 7e701f9c19ec..3be65997169a 100644 --- a/velox/expression/CastExpr.h +++ b/velox/expression/CastExpr.h @@ -22,6 +22,7 @@ namespace facebook::velox::exec { constexpr folly::StringPiece kCast = "cast"; +constexpr folly::StringPiece kTryCast = "try_cast"; /// Custom operator for casts from and to custom types. class CastOperator { @@ -71,13 +72,14 @@ class CastExpr : public SpecialForm { /// @param type The target type of the cast expression /// @param expr The expression to cast /// @param trackCpuUsage Whether to track CPU usage - CastExpr(TypePtr type, ExprPtr&& expr, bool trackCpuUsage) + CastExpr(TypePtr type, ExprPtr&& expr, bool trackCpuUsage, bool nullOnFailure) : SpecialForm( type, std::vector({expr}), - kCast.data(), + nullOnFailure ? 
kTryCast.data() : kCast.data(), false /* supportsFlatNoNullsFastPath */, - trackCpuUsage) { + trackCpuUsage), + nullOnFailure_(nullOnFailure) { auto fromType = inputs_[0]->type(); castFromOperator_ = getCustomTypeCastOperator(fromType->toString()); if (castFromOperator_ && !castFromOperator_->isSupportedToType(type)) { @@ -160,6 +162,94 @@ class CastExpr : public SpecialForm { const TypePtr& toType, VectorPtr& result); + template + void applyToSelectedNoThrowLocal( + EvalCtx& context, + const SelectivityVector& rows, + VectorPtr& result, + Func&& func); + + /// The per-row level Kernel + /// @tparam ToKind The cast target type + /// @tparam FromKind The expression type + /// @param row The index of the current row + /// @param input The input vector (of type FromKind) + /// @param result The output vector (of type ToKind) + template + void applyCastKernel( + vector_size_t row, + EvalCtx& context, + const SimpleVector::NativeType>* input, + FlatVector::NativeType>* result); + + VectorPtr castFromDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& toType); + + VectorPtr castToDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType); + + template + void applyDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType, + const TypePtr& toType, + VectorPtr& castResult); + + template + void applyIntToDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& toType, + VectorPtr& castResult); + + template + VectorPtr applyDecimalToDoubleCast( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType); + + template + void applyCastPrimitives( + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result); + + template + void applyCastPrimitivesDispatch( + const TypePtr& fromType, + const TypePtr& toType, + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result); + + template + void castTimestampToDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + VectorPtr& result, + const date::time_zone* timeZone = nullptr); + + bool nullOnFailure() const { + return nullOnFailure_; + } + + bool setNullInResultAtError() const { + return nullOnFailure() && inTopLevel; + } + // Custom cast operator for the from-type. Nullptr if the type is native or // doesn't support cast-from. CastOperatorPtr castFromOperator_; @@ -167,6 +257,10 @@ class CastExpr : public SpecialForm { // Custom cast operator for the to-type. Nullptr if the type is native or // doesn't support cast-to. 
  CastOperatorPtr castToOperator_;
+
+  bool nullOnFailure_;
+
+  bool inTopLevel = false;
 };
 
 class CastCallToSpecialForm : public FunctionCallToSpecialForm {
@@ -179,4 +273,15 @@ class CastCallToSpecialForm : public FunctionCallToSpecialForm {
       bool trackCpuUsage) override;
 };
 
+class TryCastCallToSpecialForm : public FunctionCallToSpecialForm {
+ public:
+  TypePtr resolveType(const std::vector<TypePtr>& argTypes) override;
+
+  ExprPtr constructSpecialForm(
+      const TypePtr& type,
+      std::vector<ExprPtr>&& compiledChildren,
+      bool trackCpuUsage) override;
+};
 } // namespace facebook::velox::exec
+
+#include "velox/expression/CastExpr-inl.h"
diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp
index 12119256fa97..cefdf3f065e2 100644
--- a/velox/expression/EvalCtx.cpp
+++ b/velox/expression/EvalCtx.cpp
@@ -211,6 +211,23 @@ void EvalCtx::addElementErrorsToTopLevel(
   });
 }
 
+void EvalCtx::convertElementErrorsToTopLevelNulls(
+    const SelectivityVector& elementRows,
+    const BufferPtr& elementToTopLevelRows,
+    VectorPtr& result) {
+  if (!errors_) {
+    return;
+  }
+
+  const auto* rawElementToTopLevelRows =
+      elementToTopLevelRows->as<vector_size_t>();
+  elementRows.applyToSelected([&](auto row) {
+    if (errors_->isIndexInRange(row) && !errors_->isNullAt(row)) {
+      result->setNull(rawElementToTopLevelRows[row], true);
+    }
+  });
+}
+
 const VectorPtr& EvalCtx::getField(int32_t index) const {
   const VectorPtr* field;
   if (!peeledFields_.empty()) {
diff --git a/velox/expression/EvalCtx.h b/velox/expression/EvalCtx.h
index 3e82f4c97eaf..ce9506b1a6ca 100644
--- a/velox/expression/EvalCtx.h
+++ b/velox/expression/EvalCtx.h
@@ -134,6 +134,13 @@ class EvalCtx {
       const BufferPtr& elementToTopLevelRows,
       ErrorVectorPtr& topLevelErrors);
 
+  // Given a mapping from element rows to top-level rows, set errors in
+  // the elements as nulls in the top-level rows.
+ void convertElementErrorsToTopLevelNulls( + const SelectivityVector& elementRows, + const BufferPtr& elementToTopLevelRows, + VectorPtr& result); + void deselectErrors(SelectivityVector& rows) const { if (!errors_) { return; diff --git a/velox/expression/ExprCompiler.cpp b/velox/expression/ExprCompiler.cpp index 94ce09fe3f20..9a9cff4ae9fb 100644 --- a/velox/expression/ExprCompiler.cpp +++ b/velox/expression/ExprCompiler.cpp @@ -412,13 +412,11 @@ ExprPtr compileRewrittenExpression( } else if (auto cast = dynamic_cast(expr.get())) { VELOX_CHECK(!compiledInputs.empty()); auto castExpr = std::make_shared( - resultType, std::move(compiledInputs[0]), trackCpuUsage); - if (cast->nullOnFailure()) { - result = - getSpecialForm(config, "try", resultType, {castExpr}, trackCpuUsage); - } else { - result = castExpr; - } + resultType, + std::move(compiledInputs[0]), + trackCpuUsage, + cast->nullOnFailure()); + result = castExpr; } else if (auto call = dynamic_cast(expr.get())) { if (auto specialForm = getSpecialForm( config, diff --git a/velox/expression/FunctionCallToSpecialForm.cpp b/velox/expression/FunctionCallToSpecialForm.cpp index 46855c30d857..adf576a18cb5 100644 --- a/velox/expression/FunctionCallToSpecialForm.cpp +++ b/velox/expression/FunctionCallToSpecialForm.cpp @@ -32,6 +32,7 @@ RegistryType makeRegistry() { registry.emplace( "and", std::make_unique(true /* isAnd */)); registry.emplace("cast", std::make_unique()); + registry.emplace("try_cast", std::make_unique()); registry.emplace("coalesce", std::make_unique()); registry.emplace("if", std::make_unique()); registry.emplace( diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 6bd5bcd57d0f..259db9cfe44a 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -27,8 +27,8 @@ #include "velox/vector/TypeAliases.h" using namespace facebook::velox; -using namespace facebook::velox::test; - +namespace facebook::velox::test { +namespace { class CastExprTest : public functions::test::CastBaseTest { protected: CastExprTest() { @@ -120,7 +120,7 @@ class CastExprTest : public functions::test::CastBaseTest { exec::EvalCtx evalCtx(&execCtx_, &dictionaryExprSet, rowVector.get()); dictionaryExprSet.eval(rows, evalCtx, result); - auto indices = ::makeIndicesInReverse(size, pool()); + auto indices = functions::test::makeIndicesInReverse(size, pool()); assertEqualVectors(wrapInDictionary(indices, size, expected), result[0]); } } @@ -691,6 +691,28 @@ TEST_F(CastExprTest, mapCast) { VELOX_CHECK(start + size - 1 < valuesSize); } } + + // Error handling. + { + auto data = makeRowVector( + {makeMapVector({{{"1", "2"}}, {{"", "1"}}})}); + auto result1 = evaluate("try_cast(c0 as map(int, int))", data); + auto result2 = evaluate("try(cast(c0 as map(int, int)))", data); + ASSERT_FALSE(result1->isNullAt(0)); + ASSERT_TRUE(result1->isNullAt(1)); + + ASSERT_FALSE(result2->isNullAt(0)); + ASSERT_TRUE(result2->isNullAt(1)); + ASSERT_THROW(evaluate("cast(c0 as map(int, int)", data), VeloxException); + } + + { + auto result = evaluate( + "try_cast(map(array_constructor('1'), array_constructor('')) as map(int, int))", + makeRowVector({makeFlatVector({1, 2})})); + ASSERT_TRUE(result->isNullAt(0)); + ASSERT_TRUE(result->isNullAt(1)); + } } TEST_F(CastExprTest, arrayCast) { @@ -751,6 +773,34 @@ TEST_F(CastExprTest, arrayCast) { VELOX_CHECK(start + size - 1 < elementsSize); } } + + // Error handling. 
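+  // Both try_cast and try(cast(...)) are expected to null out only the rows
+  // whose elements fail to cast; an unprotected cast should still throw.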
+ { + auto data = + makeRowVector({makeArrayVector({{"1", "2"}, {"", "1"}})}); + auto result1 = evaluate("try_cast(c0 as bigint[])", data); + auto result2 = evaluate("try(cast(c0 as bigint[]))", data); + + auto expected = makeNullableArrayVector({{{1, 2}}, std::nullopt}); + + assertEqualVectors(result1, expected); + assertEqualVectors(result2, expected); + + ASSERT_THROW(evaluate("cast(c0 as bigint[])", data), VeloxException); + } + + { + auto data = makeNullableNestedArrayVector({ + {{{{"1"_sv, "2"_sv}}, {{""_sv}}}}, // row0 + {{{{std::nullopt, "4"_sv}}}}, // row1 + }); + auto expected = makeNullableNestedArrayVector({ + std::nullopt, // row0 + {{{{std::nullopt, 4}}}}, // row1 + + }); + testComplexCast("c0", data, expected, true); + } } TEST_F(CastExprTest, rowCast) { @@ -801,6 +851,58 @@ TEST_F(CastExprTest, rowCast) { {"c0", "b"}, {doubleVectorNullEvery11, intVectorNullAll}, nullEvery(5)); testComplexCast("c0", rowVector, expectedRowVector); } + + // Error handling. + { + auto data = makeRowVector( + {makeFlatVector({"1", ""}), + makeFlatVector({"2", "3"})}); + + auto expected = makeRowVector( + {makeFlatVector({1, 2}), makeFlatVector({2, 3})}); + expected->setNull(1, true); + + testComplexCast("c0", data, expected, true); + } + + { + auto data = makeRowVector( + {makeArrayVector({{"1", ""}, {"3", "4"}}), + makeFlatVector({"2", ""})}); + + // expected1 is [null, struct{[3,4], ""}] + auto expected1 = makeRowVector( + {makeArrayVector({{1 /*will be null*/}, {3, 4}}), + makeFlatVector({"2" /*will be null*/, ""})}); + expected1->setNull(0, true); + + // expected2 is [struct{["1",""], 2}, null] + auto expected2 = makeRowVector( + {makeArrayVector({{"1", ""}, {"3", "4"}}), + makeFlatVector({2, 0 /*null*/})}); + expected2->setNull(1, true); + + // expected3 is [null, null] + auto expected3 = makeRowVector( + {makeArrayVector({{1}}), makeFlatVector(1)}); + expected3->resize(2); + expected3->setNull(0, true); + expected3->setNull(1, true); + + testComplexCast("c0", data, expected1, true); + testComplexCast("c0", data, expected2, true); + testComplexCast("c0", data, expected3, true); + } + + // Null handling for nested structs. + { + auto data = + makeRowVector({makeRowVector({makeFlatVector({"1", ""})})}); + auto expected = + makeRowVector({makeRowVector({makeFlatVector({1, 0})})}); + expected->setNull(1, true); + testComplexCast("c0", data, expected, true); + } } TEST_F(CastExprTest, nulls) { @@ -1135,7 +1237,7 @@ class TestingDictionaryOverConstFunction : public exec::VectorFunction { const auto size = rows.size(); auto constant = BaseVector::wrapInConstant(size, 0, args[0]); - auto indices = makeIndicesInReverse(size, context.pool()); + auto indices = functions::test::makeIndicesInReverse(size, context.pool()); auto nulls = allocateNulls(size, context.pool()); result = BaseVector::wrapInDictionary(nulls, indices, size, std::move(constant)); @@ -1168,7 +1270,8 @@ TEST_F(CastExprTest, dictionaryOverConst) { } namespace { -// Wrap input in a dictionary that point to subset of rows of the inner vector. +// Wrap input in a dictionary that point to subset of rows of the inner +// vector. class TestingDictionaryToFewerRowsFunction : public exec::VectorFunction { public: TestingDictionaryToFewerRowsFunction() {} @@ -1206,16 +1309,16 @@ TEST_F(CastExprTest, dictionaryEncodedNestedInput) { // Cast ARRAY> to ARRAY> where the outermost ARRAY // layer and innermost BIGINT layer are dictionary-encoded. 
This test case
   // ensures that when casting the ROW vector, the result ROW vector
-  // would not be longer than the result VARCHAR vector. In the test below, the
-  // ARRAY vector has 2 rows, each containing 3 elements. The ARRAY vector is
-  // wrapped in a dictionary layer that only references its first row, hence
-  // only the first 3 out of 6 rows are evaluated for the ROW and BIGINT vector.
-  // The BIGINT vector is also dictionary-encoded, so CastExpr produces a result
-  // VARCHAR vector of length 3. If the casting of the ROW vector produces a
-  // result ROW vector of the length of all rows, i.e., 6, the
-  // subsequent call to Expr::addNull() would throw due to the attempt of
-  // accessing the element VARCHAR vector at indices corresonding to the
-  // non-existent ROW at indices 3--5.
+  // would not be longer than the result VARCHAR vector. In the test below,
+  // the ARRAY vector has 2 rows, each containing 3 elements. The ARRAY vector
+  // is wrapped in a dictionary layer that only references its first row,
+  // hence only the first 3 out of 6 rows are evaluated for the ROW and BIGINT
+  // vector. The BIGINT vector is also dictionary-encoded, so CastExpr
+  // produces a result VARCHAR vector of length 3. If the casting of the ROW
+  // vector produces a result ROW vector of the length of all rows,
+  // i.e., 6, the subsequent call to Expr::addNull() would throw due to the
+  // attempt of accessing the element VARCHAR vector at indices corresponding
+  // to the non-existent ROW at indices 3--5.
   exec::registerVectorFunction(
       "add_dict",
       TestingDictionaryToFewerRowsFunction::signatures(),
@@ -1256,3 +1359,64 @@ TEST_F(CastExprTest, smallerNonNullRowsSizeThanRows) {
   auto expected = makeNullableFlatVector({4, 6, 7, std::nullopt});
   assertEqualVectors(expected, result);
 }
+
+TEST_F(CastExprTest, tryCastDoesNotHideInputsAndExistingErrors) {
+  auto test = [&](const std::string& castExprThatThrow,
+                  const std::string& type,
+                  const auto& data) {
+    ASSERT_THROW(
+        auto result = evaluate(
+            fmt::format("try_cast({} as {})", castExprThatThrow, type), data),
+        VeloxException);
+
+    ASSERT_NO_THROW(evaluate(
+        fmt::format("try (cast ({} as {}))", castExprThatThrow, type), data));
+    ASSERT_NO_THROW(evaluate(fmt::format("try_{}", castExprThatThrow), data));
+    ASSERT_NO_THROW(evaluate(fmt::format("try ({})", castExprThatThrow), data));
+  };
+
+  {
+    auto data = makeRowVector({makeFlatVector({1, 2, 3, 4})});
+    test("cast('' as int)", "int", data);
+  }
+
+  {
+    auto data =
+        makeRowVector({makeArrayVector({{"1", "", "3", "4"}})});
+    test("cast(c0 as integer[])", "integer[]", data);
+    test("cast(map(c0, c0) as map(int, int))", "map(int, int)", data);
+    test(
+        "cast(row_constructor(c0, c0, c0) as struct(a int[], b bigint[], c float[]))",
+        "struct(a int[], b bigint[], c float[])",
+        data);
+  }
+
+  {
+    auto data = makeRowVector(
+        {makeFlatVector({true, false, true, false}),
+         makeFlatVector({{"1", "2", "3", "4"}})});
+
+    ASSERT_THROW(
+        evaluate("switch(c0, cast('' as int), cast(c1 as integer))", data),
+        VeloxException);
+
+    ASSERT_THROW(
+        evaluate("switch(c0, cast('' as int), try_cast(c1 as integer))", data),
+        VeloxException);
+
+    {
+      auto result = evaluate(
+          "try(switch(c0, cast('' as int), cast(c1 as integer)))", data);
+      ASSERT_TRUE(result->isNullAt(0));
+      ASSERT_TRUE(result->isNullAt(2));
+    }
+
+    {
+      auto result = evaluate(
+          "try(switch(c0, try_cast('' as int), cast(c1 as integer)))", data);
+      ASSERT_TRUE(result->isNullAt(0));
+      ASSERT_TRUE(result->isNullAt(2));
+    }
+  }
+}
+} // namespace
+} // namespace facebook::velox::test
diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp
index 7adf29b8d5c3..ab588381ae7d 100644
--- a/velox/expression/tests/ExpressionVerifier.cpp
+++ b/velox/expression/tests/ExpressionVerifier.cpp
@@ -19,6 +19,7 @@
 #include "velox/expression/Expr.h"
 #include "velox/vector/VectorSaver.h"
 #include "velox/vector/tests/utils/VectorMaker.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
 
 namespace facebook::velox::test {
 
@@ -41,6 +42,15 @@ void logRowVector(const RowVectorPtr& rowVector) {
     }
   }
 }
+
+namespace {
+auto createCopy(const VectorPtr& input) {
+  VectorPtr result;
+  SelectivityVector rows(input->size());
+  BaseVector::ensureWritable(rows, input->type(), input->pool(), result);
+  result->copy(input.get(), rows, nullptr);
+  return result;
+}
+} // namespace
 
 void compareVectors(
     const VectorPtr& left,
@@ -147,9 +157,13 @@ ResultOrError ExpressionVerifier::verify(
       LOG(INFO) << "Modified inputs for common eval path: ";
       logRowVector(inputRowVector);
     }
-    exec::EvalCtx evalCtxCommon(execCtx_, &exprSetCommon, inputRowVector.get());
+    auto copy = createCopy(inputRowVector);
+
+    exec::EvalCtx evalCtxCommon(execCtx_, &exprSetCommon, inputRowVector.get());
     exprSetCommon.eval(rows, evalCtxCommon, commonEvalResult);
+    assertEqualVectors(copy, inputRowVector);
+
   } catch (const VeloxUserError&) {
     if (!canThrow) {
       LOG(ERROR)
@@ -175,7 +189,10 @@
     exec::EvalCtx evalCtxSimplified(
         execCtx_, &exprSetSimplified, rowVector.get());
 
+    auto copy = createCopy(rowVector);
     exprSetSimplified.eval(rows, evalCtxSimplified, simplifiedEvalResult);
+    assertEqualVectors(copy, rowVector);
+
   } catch (const VeloxUserError&) {
     exceptionSimplifiedPtr = std::current_exception();
   } catch (...) {
diff --git a/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp b/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp
index 9c799aaca05f..fe01163a8597 100644
--- a/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp
+++ b/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp
@@ -25,9 +25,14 @@
 #include "velox/functions/facebook/prestosql/Register.h"
 #include "velox/functions/prestosql/registration/RegistrationFunctions.h"
 
+DEFINE_bool(
+    include_fb_only,
+    true,
+    "If true, fb-only functions are included in the test.");
+
 DEFINE_int64(
     seed,
-    123456,
+    0,
     "Initial seed for random number generator "
     "(use it to reproduce previous results).");
 
@@ -44,22 +49,14 @@ DEFINE_string(
     "Comma-separated list of special forms to use in generated expression. "
     "Supported special forms: and, or, coalesce, if, switch, cast.");
 
-int main(int argc, char** argv) {
-  facebook::velox::functions::prestosql::registerAllScalarFacebookOnlyFunctions(
-      "");
-  facebook::velox::functions::prestosql::registerAllScalarFunctions();
-
-  ::testing::InitGoogleTest(&argc, argv);
-
-  // Calls common init functions in the necessary order, initializing
-  // singletons, installing proper signal handlers for better debugging
-  // experience, and initialize glog and gflags.
- folly::init(&argc, &argv); - - // The following list are the Spark UDFs that hit issues - // For rlike you need the following combo in the only list: - // rlike, md5 and upper +class FacebookPrestoExpressionFuzzerTest : public testing::Test {}; +TEST_F(FacebookPrestoExpressionFuzzerTest, test) { + if (FLAGS_include_fb_only) { + facebook::velox::functions::prestosql:: + registerAllScalarFacebookOnlyFunctions(""); + } + facebook::velox::functions::prestosql::registerAllScalarFunctions(); // TODO: List of the functions that at some point crash or fail and need to // be fixed before we can enable. std::unordered_set skipFunctions = { @@ -76,7 +73,7 @@ int main(int argc, char** argv) { // https://github.com/facebookincubator/velox/issues/5398 "concat", }; - - return FuzzerRunner::run( - FLAGS_only, FLAGS_seed, skipFunctions, FLAGS_special_forms); + size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; + return FuzzerRunner::runFromGtest( + FLAGS_only, initialSeed, skipFunctions, FLAGS_special_forms); } diff --git a/velox/expression/tests/FuzzerRunner.h b/velox/expression/tests/FuzzerRunner.h index 3c4c0ecd7c39..5d5e5bf65daa 100644 --- a/velox/expression/tests/FuzzerRunner.h +++ b/velox/expression/tests/FuzzerRunner.h @@ -135,11 +135,18 @@ class FuzzerRunner { size_t seed, const std::unordered_set& skipFunctions, const std::string& specialForms) { + runFromGtest(onlyFunctions, seed, skipFunctions, specialForms); + return RUN_ALL_TESTS(); + } + + static void runFromGtest( + const std::string& onlyFunctions, + size_t seed, + const std::unordered_set& skipFunctions, + const std::string& specialForms) { auto signatures = facebook::velox::getFunctionSignatures(); appendSpecialForms(specialForms, signatures); facebook::velox::test::expressionFuzzer( filterSignatures(signatures, onlyFunctions, skipFunctions), seed); - // Calling gtest here so that it can be recognized as tests in CI systems. - return RUN_ALL_TESTS(); } }; diff --git a/velox/functions/lib/RowsTranslationUtil.h b/velox/functions/lib/RowsTranslationUtil.h index 93f9b56595fc..bd374bb9529b 100644 --- a/velox/functions/lib/RowsTranslationUtil.h +++ b/velox/functions/lib/RowsTranslationUtil.h @@ -13,6 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#pragma once + #include "velox/common/base/Nulls.h" #include "velox/vector/BaseVector.h" #include "velox/vector/SelectivityVector.h" diff --git a/velox/functions/prestosql/ArrayFunctions.h b/velox/functions/prestosql/ArrayFunctions.h index 6d00dcb57315..1b8030fb2fc9 100644 --- a/velox/functions/prestosql/ArrayFunctions.h +++ b/velox/functions/prestosql/ArrayFunctions.h @@ -697,35 +697,4 @@ struct ArrayUnionFunction { } }; -template -struct ArrayUnionFunctionString { - VELOX_DEFINE_FUNCTION_TYPES(T); - - static constexpr int32_t reuse_strings_from_arg = 0; - - // String version that avoids copy of strings. 
- FOLLY_ALWAYS_INLINE void call( - out_type>& out, - const arg_type>& inputArray1, - const arg_type>& inputArray2) { - folly::F14FastSet elementSet; - bool nullAdded = false; - auto addItems = [&](auto& inputArray) { - for (const auto& item : inputArray) { - if (item.has_value()) { - if (elementSet.insert(item.value()).second) { - auto& newItem = out.add_item(); - newItem.setNoCopy(item.value()); - } - } else if (!nullAdded) { - nullAdded = true; - out.add_null(); - } - } - }; - addItems(inputArray1); - addItems(inputArray2); - } -}; - } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/InPredicate.cpp b/velox/functions/prestosql/InPredicate.cpp index 2383cb8cd8d2..d979fbd49e9d 100644 --- a/velox/functions/prestosql/InPredicate.cpp +++ b/velox/functions/prestosql/InPredicate.cpp @@ -86,6 +86,43 @@ std::pair, bool> createBigintValuesFilter( return {common::createBigintValues(values, nullAllowed), false}; } +// Cast double to Int64 and reuse Int64 filters +template +std::pair, bool> +createFloatingPointValuesFilter( + const std::vector& inputArgs) { + auto valuesPair = toValues(inputArgs); + if (!valuesPair.has_value()) { + return {nullptr, false}; + } + + auto& values = valuesPair.value().first; + bool nullAllowed = valuesPair.value().second; + + if (values.empty() && nullAllowed) { + return {nullptr, true}; + } + VELOX_USER_CHECK( + !values.empty(), + "IN predicate expects at least one non-null value in the in-list"); + + if (values.size() == 1) { + return { + std::make_unique>( + values[0], false, false, values[0], false, false, nullAllowed), + false}; + } + + std::vector intValues(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + if (values[i] == double{}) { + values[i] = 0; + } + intValues[i] = reinterpret_cast(values[i]); + } + return {common::createBigintValues(intValues, nullAllowed), false}; +} + // See createBigintValuesFilter. std::pair, bool> createBytesValuesFilter( const std::vector& inputArgs) { @@ -140,6 +177,9 @@ class InPredicate : public exec::VectorFunction { case TypeKind::TINYINT: filter = createBigintValuesFilter(inputArgs); break; + case TypeKind::DOUBLE: + filter = createFloatingPointValuesFilter(inputArgs); + break; case TypeKind::BOOLEAN: // Hack: using BIGINT filter for bool, which is essentially "int1_t". 
filter = createBigintValuesFilter(inputArgs); @@ -194,6 +234,19 @@ class InPredicate : public exec::VectorFunction { return filter_->testInt64(value); }); break; + case TypeKind::DOUBLE: + applyTyped(rows, input, context, result, [&](double value) { + auto* derived = + dynamic_cast*>(filter_.get()); + if (derived) { + return filter_->testDouble(value); + } + if (value == double{}) { + value = 0; + } + return filter_->testInt64(reinterpret_cast(value)); + }); + break; case TypeKind::BOOLEAN: applyTyped(rows, input, context, result, [&](bool value) { return filter_->testInt64(value); @@ -224,6 +277,7 @@ class InPredicate : public exec::VectorFunction { "bigint", "varchar", "varbinary", + "double", "date"}) { signatures.emplace_back(exec::FunctionSignatureBuilder() .returnType("boolean") diff --git a/velox/functions/prestosql/Probability.h b/velox/functions/prestosql/Probability.h index 3d58a7683900..b500b8cb904d 100644 --- a/velox/functions/prestosql/Probability.h +++ b/velox/functions/prestosql/Probability.h @@ -18,6 +18,8 @@ #include "boost/math/distributions/beta.hpp" #include "boost/math/distributions/binomial.hpp" #include "boost/math/distributions/cauchy.hpp" +#include "boost/math/distributions/chi_squared.hpp" +#include "boost/math/distributions/fisher_f.hpp" #include "velox/common/base/Exceptions.h" #include "velox/functions/Macros.h" @@ -136,5 +138,33 @@ struct InverseBetaCDFFunction { } }; +template +struct ChiSquaredCDFFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call(double& result, double df, double value) { + VELOX_USER_CHECK_GT(df, 0, "df must be greater than 0"); + VELOX_USER_CHECK_GE(value, 0, "value must non-negative"); + + boost::math::chi_squared_distribution<> dist(df); + result = boost::math::cdf(dist, value); + } +}; + +template +struct FCDFFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void + call(double& result, double df1, double df2, double value) { + VELOX_USER_CHECK_GE(value, 0, "value must non-negative"); + VELOX_USER_CHECK_GT(df1, 0, "numerator df must be greater than 0"); + VELOX_USER_CHECK_GT(df2, 0, "denominator df must be greater than 0"); + + boost::math::fisher_f_distribution<> dist(df1, df2); + result = boost::math::cdf(dist, value); + } +}; + } // namespace } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/SIMDJsonFunctions.h b/velox/functions/prestosql/SIMDJsonFunctions.h index e39d6ed5c13c..f6e727e4e147 100644 --- a/velox/functions/prestosql/SIMDJsonFunctions.h +++ b/velox/functions/prestosql/SIMDJsonFunctions.h @@ -205,4 +205,119 @@ struct SIMDJsonExtractScalarFunction { } }; +template +struct SIMDJsonExtractFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + bool call( + out_type& result, + const arg_type& json, + const arg_type& jsonPath) { + static constexpr std::string_view kNullString{"null"}; + std::string results; + size_t resultSize = 0; + auto consumer = [&results, &resultSize](auto& v) { + // Add the separator for the JSON array. + if (resultSize++ > 0) { + results += ","; + } + // We could just convert v to a string using to_json_string directly, but + // in that case the JSON wouldn't be parsed (it would just return the + // contents directly) and we might miss invalid JSON. 
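+      // For example, with path $.key[*].k1 over {"key":[{"k1":1},{"k1":2}]}
+      // the consumer runs once per match, emitting 1 and 2, which call()
+      // later joins into the array "[1,2]".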
+ switch (v.type()) { + case simdjson::ondemand::json_type::object: + results += simdjson::to_json_string(v.get_object()).value(); + break; + case simdjson::ondemand::json_type::array: + results += simdjson::to_json_string(v.get_array()).value(); + break; + case simdjson::ondemand::json_type::string: + case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::boolean: + results += simdjson::to_json_string(v).value(); + break; + case simdjson::ondemand::json_type::null: + results += kNullString; + break; + } + }; + + if (!simdJsonExtract(json, jsonPath, consumer)) { + // If there's an error parsing the JSON, return null. + return false; + } + + if (resultSize == 0) { + // If the path didn't map to anything in the JSON object, return null. + return false; + } + + if (resultSize == 1) { + if (results == kNullString) { + // If there was only one value mapped to by the path and it was null, + // return null directly. + return false; + } + + // If there was only one value mapped to by the path, don't wrap it in an + // array. + result.copy_from(results); + } else { + // Add the square brackets to make it a valid JSON array. + result.copy_from("[" + results + "]"); + } + return true; + } +}; + +template +struct SIMDJsonSizeFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE bool call( + int64_t& result, + const arg_type& json, + const arg_type& jsonPath) { + size_t resultCount = 0; + size_t singleResultSize = 0; + auto consumer = [&resultCount, &singleResultSize](auto& v) { + resultCount++; + + if (resultCount == 1) { + // We only need the size of the actual object if there's only one + // returned, if multiple are returned we use the number of objects + // returned instead. + switch (v.type()) { + case simdjson::ondemand::json_type::object: + singleResultSize = v.count_fields().value(); + break; + case simdjson::ondemand::json_type::array: + singleResultSize = v.count_elements().value(); + break; + case simdjson::ondemand::json_type::string: + case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::boolean: + case simdjson::ondemand::json_type::null: + singleResultSize = 0; + break; + } + } + }; + + if (!simdJsonExtract(json, jsonPath, consumer)) { + // If there's an error parsing the JSON, return null. + return false; + } + + if (resultCount == 0) { + // If the path didn't map to anything in the JSON object, return null. + return false; + } + + result = resultCount == 1 ? singleResultSize : resultCount; + + return true; + } +}; + } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp b/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp index b9ee996e22be..da44590fda6c 100644 --- a/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp +++ b/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp @@ -180,6 +180,31 @@ class MinAggregate : public MinMaxAggregate { } } + bool supportsToIntermediate() const override { + return true; + } + + void toIntermediate( + const SelectivityVector& rows, + std::vector& args, + VectorPtr& result) const override { + const auto& input = args[0]; + if (rows.isAllSelected()) { + result = input; + return; + } + + auto* pool = BaseAggregate::allocator_->pool(); + + result = BaseVector::create(input->type(), rows.size(), pool); + result->copy(input.get(), 0, 0, rows.size()); + + // Set result to NULL for rows that are masked out. 
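+    // allocateNulls(..., bits::kNull) starts with every row marked null;
+    // rows.clearNulls() then clears the null bit for the selected rows, so
+    // only the masked-out rows stay null.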
+    BufferPtr nulls = allocateNulls(rows.size(), pool, bits::kNull);
+    rows.clearNulls(nulls);
+    result->setNulls(nulls);
+  }
+
   void addRawInput(
       char** groups,
       const SelectivityVector& rows,
@@ -258,6 +283,33 @@
     }
   }
 
+  bool supportsToIntermediate() const override {
+    return true;
+  }
+
+  void toIntermediate(
+      const SelectivityVector& rows,
+      std::vector<VectorPtr>& args,
+      VectorPtr& result) const override {
+    const auto& input = args[0];
+    if (rows.isAllSelected()) {
+      result = input;
+      return;
+    }
+
+    auto* pool = allocator_->pool();
+
+    // Set result to NULL for rows that are masked out.
+    BufferPtr nulls = allocateNulls(rows.size(), pool, bits::kNull);
+    rows.clearNulls(nulls);
+
+    BufferPtr indices = allocateIndices(rows.size(), pool);
+    auto* rawIndices = indices->asMutable<vector_size_t>();
+    std::iota(rawIndices, rawIndices + rows.size(), 0);
+
+    result = BaseVector::wrapInDictionary(nulls, indices, rows.size(), input);
+  }
+
   void extractValues(char** groups, int32_t numGroups, VectorPtr* result)
       override {
     VELOX_CHECK(result);
diff --git a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp
index 232e60d1f64d..769bfd224414 100644
--- a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp
+++ b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp
@@ -60,6 +60,15 @@ TEST_F(ArrayAggTest, groupBy) {
       {"c0"},
       {"array_agg(a)"},
       "SELECT c0, array_agg(a) FROM tmp GROUP BY c0");
+
+  // With one function that supports toIntermediate and one that does not,
+  // make sure the row container is recreated with only the function without
+  // toIntermediate support.
+  testAggregations(
+      batches,
+      {"c0"},
+      {"array_agg(a)", "max(c0)"},
+      "SELECT c0, array_agg(a), max(c0) FROM tmp GROUP BY c0");
 }
 
 TEST_F(ArrayAggTest, sortedGroupBy) {
diff --git a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
index fb0952e5992f..b798c2640f9e 100644
--- a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
+++ b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
@@ -51,6 +51,14 @@ class JsonBenchmark : public velox::functions::test::FunctionBenchmarkBase {
         {"folly_json_extract_scalar"});
     registerFunction(
         {"simd_json_extract_scalar"});
+    registerFunction(
+        {"folly_json_extract"});
+    registerFunction(
+        {"simd_json_extract"});
+    registerFunction(
+        {"folly_json_size"});
+    registerFunction(
+        {"simd_json_size"});
   }
 
   std::string prepareData(int jsonSize) {
@@ -203,6 +211,42 @@ void SIMDJsonExtractScalar(int iter, int vectorSize, int jsonSize) {
       iter, vectorSize, "simd_json_extract_scalar", json, "$.key[7].k1");
 }
 
+void FollyJsonExtract(int iter, int vectorSize, int jsonSize) {
+  folly::BenchmarkSuspender suspender;
+  JsonBenchmark benchmark;
+  auto json = benchmark.prepareData(jsonSize);
+  suspender.dismiss();
+  benchmark.runWithJsonExtract(
+      iter, vectorSize, "folly_json_extract", json, "$.key[*].k1");
+}
+
+void SIMDJsonExtract(int iter, int vectorSize, int jsonSize) {
+  folly::BenchmarkSuspender suspender;
+  JsonBenchmark benchmark;
+  auto json = benchmark.prepareData(jsonSize);
+  suspender.dismiss();
+  benchmark.runWithJsonExtract(
+      iter, vectorSize, "simd_json_extract", json, "$.key[*].k1");
+}
+
+void FollyJsonSize(int iter, int vectorSize, int jsonSize) {
+  folly::BenchmarkSuspender suspender;
+  JsonBenchmark benchmark;
+  auto json = benchmark.prepareData(jsonSize);
+  suspender.dismiss();
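+  // folly_json_size is the baseline; the SIMDJsonSize run below is reported
+  // relative to it via BENCHMARK_RELATIVE_NAMED_PARAM.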
benchmark.runWithJsonExtract( + iter, vectorSize, "folly_json_size", json, "$.key"); +} + +void SIMDJsonSize(int iter, int vectorSize, int jsonSize) { + folly::BenchmarkSuspender suspender; + JsonBenchmark benchmark; + auto json = benchmark.prepareData(jsonSize); + suspender.dismiss(); + benchmark.runWithJsonExtract( + iter, vectorSize, "simd_json_size", json, "$.key"); +} + BENCHMARK_DRAW_LINE(); BENCHMARK_NAMED_PARAM(FollyIsJsonScalar, 100_iters_10bytes_size, 100, 10); @@ -368,6 +412,64 @@ BENCHMARK_RELATIVE_NAMED_PARAM( 10000); BENCHMARK_DRAW_LINE(); +BENCHMARK_DRAW_LINE(); +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_10bytes_size, 100, 10); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_10bytes_size, + 100, + 10); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_100bytes_size, 100, 100); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_100bytes_size, + 100, + 100); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_1000bytes_size, 100, 1000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_1000bytes_size, + 100, + 1000); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_10000bytes_size, 100, 10000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_10000bytes_size, + 100, + 10000); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_DRAW_LINE(); +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_10bytes_size, 100, 10); +BENCHMARK_RELATIVE_NAMED_PARAM(SIMDJsonSize, 100_iters_10bytes_size, 100, 10); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_100bytes_size, 100, 100); +BENCHMARK_RELATIVE_NAMED_PARAM(SIMDJsonSize, 100_iters_100bytes_size, 100, 100); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_1000bytes_size, 100, 1000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonSize, + 100_iters_1000bytes_size, + 100, + 1000); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_10000bytes_size, 100, 10000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonSize, + 100_iters_10000bytes_size, + 100, + 10000); +BENCHMARK_DRAW_LINE(); + } // namespace } // namespace facebook::velox::functions::prestosql diff --git a/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp b/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp index 64d2a4e53b23..5898a2fd77fa 100644 --- a/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp @@ -109,8 +109,12 @@ void registerSimpleFunctions(const std::string& prefix) { {prefix + "binomial_cdf"}); registerFunction( {prefix + "cauchy_cdf"}); + registerFunction( + {prefix + "chi_squared_cdf"}); registerFunction( {prefix + "inverse_beta_cdf"}); + registerFunction( + {prefix + "f_cdf"}); } } // namespace diff --git a/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp b/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp index 0e4604b200e3..b1ffd10271ee 100644 --- a/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp @@ -194,11 +194,6 @@ void registerArrayFunctions(const std::string& prefix) { Array>, int64_t>({prefix + "trim_array"}); - registerFunction< - ArrayUnionFunctionString, - Array, - Array, - Array>({prefix + "array_union"}); registerArrayUnionFunctions(prefix); registerArrayUnionFunctions(prefix); 
registerArrayUnionFunctions(prefix); diff --git a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp index bffda97f486a..763588418833 100644 --- a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp @@ -32,9 +32,9 @@ void registerJsonFunctions(const std::string& prefix) { registerFunction( {prefix + "json_extract_scalar"}); - registerFunction( + registerFunction( {prefix + "json_extract"}); - registerFunction( + registerFunction( {prefix + "json_extract"}); registerFunction( @@ -59,9 +59,9 @@ void registerJsonFunctions(const std::string& prefix) { registerFunction( {prefix + "json_array_contains"}); - registerFunction( + registerFunction( {prefix + "json_size"}); - registerFunction( + registerFunction( {prefix + "json_size"}); VELOX_REGISTER_VECTOR_FUNCTION(udf_json_format, prefix + "json_format"); diff --git a/velox/functions/prestosql/tests/InPredicateTest.cpp b/velox/functions/prestosql/tests/InPredicateTest.cpp index 3f54db94962f..026ca2cb8263 100644 --- a/velox/functions/prestosql/tests/InPredicateTest.cpp +++ b/velox/functions/prestosql/tests/InPredicateTest.cpp @@ -406,3 +406,157 @@ TEST_F(InPredicateTest, reusableResult) { auto expected = makeFlatVector({false, true, true, false}); assertEqualVectors(expected, actual); } + +TEST_F(InPredicateTest, doubleWithZero) { + // zero and negative zero, FloatingPointRange + auto input = makeRowVector({ + makeNullableFlatVector({0.0, -0.0}, DOUBLE()), + }); + auto predicate = "c0 IN ( 0.0 )"; + auto result = evaluate>(predicate, input); + auto expected = makeNullableFlatVector({true, true}); + assertEqualVectors(expected, result); + + // zero and negative zero, BigintValuesUsingHashTable, 0 in valuesList + input = makeRowVector({ + makeNullableFlatVector({0.0, -0.0}, DOUBLE()), + }); + predicate = "c0 IN ( 0.0, 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, true}); + assertEqualVectors(expected, result); + + // zero and negative zero, BigintValuesUsingHashTable, -0 in valuesList + input = makeRowVector({ + makeNullableFlatVector({0.0, -0.0}, DOUBLE()), + }); + predicate = "c0 IN ( -0.0, 1.2, 2.3, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, true}); + assertEqualVectors(expected, result); + + // TODO : zero and negative zero, BigintValuesUsingBitmask, depending on + // another fix +} + +TEST_F(InPredicateTest, double) { + // No Null + auto input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, 3.4}, DOUBLE()), + }); + std::string predicate = "c0 IN ( 1.2, 2.3, 3.4 )"; + auto expected = makeConstant(true, input->size()); + auto result = evaluate>(predicate, input); + assertEqualVectors(expected, result); + + // InList has Null + // Since there is only one non-null float, it will use FloatingPointRange + input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, 3.4}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, std::nullopt, std::nullopt}); + assertEqualVectors(expected, result); + + // InList has Null + // Multiple non-null, using BigintValuesUsingHashTable + predicate = "c0 IN ( 1.2, 2.3, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, true, std::nullopt}); + assertEqualVectors(expected, result); + + // Value(input) has NULL + 
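+  // A null input value should yield a null result rather than an error,
+  // matching SQL IN semantics.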
input = makeRowVector({ + makeNullableFlatVector({1.2, 1.3, std::nullopt}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, false, std::nullopt}); + assertEqualVectors(expected, result); + + // NaN + input = makeRowVector({ + makeNullableFlatVector({std::nan("")}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); + + predicate = "c0 IN ( 1.2, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({std::nullopt}); + assertEqualVectors(expected, result); + + // Infinity + input = makeRowVector({ + makeNullableFlatVector( + {std::numeric_limits::infinity()}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); +} + +TEST_F(InPredicateTest, float) { + // No Null + auto input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, 3.4}, REAL()), + }); + std::string predicate = + "c0 IN ( CAST(1.2 AS REAL), CAST(2.3 AS REAL), CAST(3.4 AS REAL) )"; + auto expected = makeConstant(true, input->size()); + auto result = evaluate>(predicate, input); + assertEqualVectors(expected, result); + + /// InList has Null + // Since there is only one non-null float, it will use FloatingPointRange + predicate = "c0 IN ( CAST(1.2 AS REAL), null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, std::nullopt, std::nullopt}); + assertEqualVectors(expected, result); + + // InList has Null + // Multiple non-null, using BigintValuesUsingHashTable + // TODO: CAST(1.2 AS REAL), CAST(1.2 AS REAL) captured a bug in + // BigintValuesUsingBitmask, it will be fixed in separate diff + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL), null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, std::nullopt, std::nullopt}); + assertEqualVectors(expected, result); + + // Value(input) has NULL + input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, std::nullopt}, REAL()), + }); + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL) )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, false, std::nullopt}); + assertEqualVectors(expected, result); + + // NaN + input = makeRowVector({ + makeNullableFlatVector({std::nan("")}, REAL()), + }); + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL) )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); + + predicate = "c0 IN ( CAST(1.2 AS REAL), null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({std::nullopt}); + assertEqualVectors(expected, result); + + // Infinity + input = makeRowVector({ + makeNullableFlatVector( + {std::numeric_limits::infinity()}, REAL()), + }); + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL) )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); +} diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index ef59bb7b1638..68eb37d39cd3 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -538,10 +538,10 @@ TEST_F(JsonFunctionsTest, 
jsonExtract) { }; EXPECT_EQ( - "{\"x\":{\"a\":1,\"b\":2}}", + "{\"x\": {\"a\" : 1, \"b\" : 2} }", jsonExtract("{\"x\": {\"a\" : 1, \"b\" : 2} }", "$")); EXPECT_EQ( - "{\"a\":1,\"b\":2}", + "{\"a\" : 1, \"b\" : 2}", jsonExtract("{\"x\": {\"a\" : 1, \"b\" : 2} }", "$.x")); EXPECT_EQ("1", jsonExtract("{\"x\": {\"a\" : 1, \"b\" : 2} }", "$.x.a")); EXPECT_EQ( diff --git a/velox/functions/prestosql/tests/ProbabilityTest.cpp b/velox/functions/prestosql/tests/ProbabilityTest.cpp index 5b31d20f6947..2906992cf5cb 100644 --- a/velox/functions/prestosql/tests/ProbabilityTest.cpp +++ b/velox/functions/prestosql/tests/ProbabilityTest.cpp @@ -221,5 +221,64 @@ TEST_F(ProbabilityTest, invBetaCDF) { VELOX_ASSERT_THROW(invBetaCDF(3, 5, 1.1), "p must be in the interval [0, 1]"); } +TEST_F(ProbabilityTest, chiSquaredCDF) { + const auto chiSquaredCDF = [&](std::optional df, + std::optional value) { + return evaluateOnce("chi_squared_cdf(c0, c1)", df, value); + }; + + EXPECT_EQ(chiSquaredCDF(3, 0.0), 0.0); + EXPECT_EQ(chiSquaredCDF(3, 1.0), 0.1987480430987992); + EXPECT_EQ(chiSquaredCDF(3, 2.5), 0.52470891665697938); + EXPECT_EQ(chiSquaredCDF(3, 4), 0.73853587005088939); + // Invalid inputs + VELOX_ASSERT_THROW(chiSquaredCDF(-3, 0.3), "df must be greater than 0"); + VELOX_ASSERT_THROW(chiSquaredCDF(3, -10), "value must non-negative"); +} + +TEST_F(ProbabilityTest, fCDF) { + const auto fCDF = [&](std::optional df1, + std::optional df2, + std::optional value) { + return evaluateOnce("f_cdf(c0, c1, c2)", df1, df2, value); + }; + + EXPECT_EQ(fCDF(2.0, 5.0, 0.0), 0.0); + EXPECT_EQ(fCDF(2.0, 5.0, 0.7988), 0.50001145221750731); + EXPECT_EQ(fCDF(2.0, 5.0, 3.7797), 0.89999935988961155); + + EXPECT_EQ(fCDF(kDoubleMax, 5.0, 3.7797), 1); + EXPECT_EQ(fCDF(1, kDoubleMax, 97.1), 1); + EXPECT_EQ(fCDF(82.6, 901.10, kDoubleMax), 1); + EXPECT_EQ(fCDF(12.12, 4.2015, kDoubleMin), 0); + EXPECT_EQ(fCDF(0.4422, kDoubleMin, 0.697), 7.9148959162596482e-306); + EXPECT_EQ(fCDF(kDoubleMin, 50.620, 4), 1); + EXPECT_EQ(fCDF(kBigIntMax, 5.0, 3.7797), 0.93256230095450132); + EXPECT_EQ(fCDF(76.901, kBigIntMax, 77.97), 1); + EXPECT_EQ(fCDF(2.0, 5.0, kBigIntMax), 1); + + EXPECT_EQ(fCDF(2.0, 5.0, std::nullopt), std::nullopt); + EXPECT_EQ(fCDF(2.0, std::nullopt, 3.7797), std::nullopt); + EXPECT_EQ(fCDF(std::nullopt, 5.0, 3.7797), std::nullopt); + + // Test invalid inputs for df1. + VELOX_ASSERT_THROW(fCDF(0, 3, 0.5), "numerator df must be greater than 0"); + VELOX_ASSERT_THROW( + fCDF(kBigIntMin, 5.0, 3.7797), "numerator df must be greater than 0"); + + // Test invalid inputs for df2. + VELOX_ASSERT_THROW(fCDF(3, 0, 0.5), "denominator df must be greater than 0"); + VELOX_ASSERT_THROW( + fCDF(2.0, kBigIntMin, 3.7797), "denominator df must be greater than 0"); + + // Test invalid inputs for value. + VELOX_ASSERT_THROW(fCDF(3, 5, -0.1), "value must non-negative"); + VELOX_ASSERT_THROW(fCDF(2.0, 5.0, kBigIntMin), "value must non-negative"); + + // Test a combination of invalid inputs. + VELOX_ASSERT_THROW(fCDF(-1.2, 0, -0.1), "value must non-negative"); + VELOX_ASSERT_THROW(fCDF(1, -kInf, -0.1), "value must non-negative"); +} + } // namespace } // namespace facebook::velox diff --git a/velox/functions/sparksql/String.h b/velox/functions/sparksql/String.h index 6e935d2af79d..c3e4b67386fd 100644 --- a/velox/functions/sparksql/String.h +++ b/velox/functions/sparksql/String.h @@ -81,18 +81,29 @@ struct AsciiFunction { } }; +/// chr function +/// chr(n) -> string +/// Returns the Unicode code point ``n`` as a single character string. 
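+/// The result is UTF-8 encoded, so code points in [0x80, 0xFF] occupy two
+/// bytes.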
+/// If ``n < 0``, the result is an empty string. +/// If ``n >= 256``, the result is equivalent to chr(``n % 256``). template struct ChrFunction { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE bool call(out_type& result, int64_t ord) { - if (ord < 0) { + FOLLY_ALWAYS_INLINE void call(out_type& result, int64_t n) { + if (n < 0) { result.resize(0); } else { - result.resize(1); - *result.data() = ord; + n = n & 0xFF; + if (n < 0x80) { + result.resize(1); + result.data()[0] = n; + } else { + result.resize(2); + result.data()[0] = 0xC0 + (n >> 6); + result.data()[1] = 0x80 + (n & 0x3F); + } } - return true; } }; diff --git a/velox/functions/sparksql/tests/StringTest.cpp b/velox/functions/sparksql/tests/StringTest.cpp index 71e3ab8e4586..9fc038bc5bfa 100644 --- a/velox/functions/sparksql/tests/StringTest.cpp +++ b/velox/functions/sparksql/tests/StringTest.cpp @@ -201,11 +201,16 @@ TEST_F(StringTest, Ascii) { } TEST_F(StringTest, Chr) { - EXPECT_EQ(chr(0), std::string("\0", 1)); - EXPECT_EQ(chr(32), " "); EXPECT_EQ(chr(-16), ""); - EXPECT_EQ(chr(256), std::string("\0", 1)); - EXPECT_EQ(chr(256 + 32), std::string(" ", 1)); + EXPECT_EQ(chr(0), std::string("\0", 1)); + EXPECT_EQ(chr(0x100), std::string("\0", 1)); + EXPECT_EQ(chr(0x1100), std::string("\0", 1)); + EXPECT_EQ(chr(0x20), "\x20"); + EXPECT_EQ(chr(0x100 + 0x20), "\x20"); + EXPECT_EQ(chr(0x80), "\xC2\x80"); + EXPECT_EQ(chr(0x100 + 0x80), "\xC2\x80"); + EXPECT_EQ(chr(0xFF), "\xC3\xBF"); + EXPECT_EQ(chr(0x100 + 0xFF), "\xC3\xBF"); EXPECT_EQ(chr(std::nullopt), std::nullopt); } diff --git a/velox/row/CMakeLists.txt b/velox/row/CMakeLists.txt index 2f57248047d5..03dff1030636 100644 --- a/velox/row/CMakeLists.txt +++ b/velox/row/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_row_fast UnsafeRowFast.cpp) +add_library(velox_row_fast UnsafeRowFast.cpp CompactRow.cpp) target_link_libraries(velox_row_fast velox_vector) diff --git a/velox/row/CompactRow.cpp b/velox/row/CompactRow.cpp new file mode 100644 index 000000000000..04196096a18c --- /dev/null +++ b/velox/row/CompactRow.cpp @@ -0,0 +1,961 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/velox/row/CMakeLists.txt b/velox/row/CMakeLists.txt
index 2f57248047d5..03dff1030636 100644
--- a/velox/row/CMakeLists.txt
+++ b/velox/row/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_library(velox_row_fast UnsafeRowFast.cpp)
+add_library(velox_row_fast UnsafeRowFast.cpp CompactRow.cpp)
 
 target_link_libraries(velox_row_fast velox_vector)
 
diff --git a/velox/row/CompactRow.cpp b/velox/row/CompactRow.cpp
new file mode 100644
index 000000000000..04196096a18c
--- /dev/null
+++ b/velox/row/CompactRow.cpp
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/row/CompactRow.h"
+#include "velox/vector/FlatVector.h"
+
+namespace facebook::velox::row {
+
+CompactRow::CompactRow(const RowVectorPtr& vector)
+    : typeKind_{vector->typeKind()}, decoded_{*vector} {
+  initialize(vector->type());
+}
+
+CompactRow::CompactRow(const VectorPtr& vector)
+    : typeKind_{vector->typeKind()}, decoded_{*vector} {
+  initialize(vector->type());
+}
+
+void CompactRow::initialize(const TypePtr& type) {
+  auto base = decoded_.base();
+  switch (typeKind_) {
+    case TypeKind::ARRAY: {
+      auto arrayBase = base->as<ArrayVector>();
+      children_.push_back(CompactRow(arrayBase->elements()));
+      childIsFixedWidth_.push_back(
+          arrayBase->elements()->type()->isFixedWidth());
+      break;
+    }
+    case TypeKind::MAP: {
+      auto mapBase = base->as<MapVector>();
+      children_.push_back(CompactRow(mapBase->mapKeys()));
+      children_.push_back(CompactRow(mapBase->mapValues()));
+      childIsFixedWidth_.push_back(mapBase->mapKeys()->type()->isFixedWidth());
+      childIsFixedWidth_.push_back(
+          mapBase->mapValues()->type()->isFixedWidth());
+      break;
+    }
+    case TypeKind::ROW: {
+      auto rowBase = base->as<RowVector>();
+      for (const auto& child : rowBase->children()) {
+        children_.push_back(CompactRow(child));
+        childIsFixedWidth_.push_back(child->type()->isFixedWidth());
+      }
+
+      rowNullBytes_ = bits::nbytes(type->size());
+      break;
+    }
+    case TypeKind::BOOLEAN:
+      valueBytes_ = 1;
+      fixedWidthTypeKind_ = true;
+      break;
+    case TypeKind::TINYINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::SMALLINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::INTEGER:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::BIGINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::HUGEINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::REAL:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::DOUBLE:
+      valueBytes_ = type->cppSizeInBytes();
+      fixedWidthTypeKind_ = true;
+      supportsBulkCopy_ = decoded_.isIdentityMapping();
+      break;
+    case TypeKind::TIMESTAMP:
+      valueBytes_ = sizeof(int64_t);
+      fixedWidthTypeKind_ = true;
+      break;
+    case TypeKind::VARCHAR:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::VARBINARY:
+      // Nothing to do.
+      break;
+    case TypeKind::UNKNOWN:
+      // UNKNOWN values are always nulls, hence, do not take up space.
+ valueBytes_ = 0; + fixedWidthTypeKind_ = true; + supportsBulkCopy_ = true; + break; + default: + VELOX_UNSUPPORTED("Unsupported type: {}", type->toString()); + } +} + +// static +std::optional CompactRow::fixedRowSize(const RowTypePtr& rowType) { + const size_t numFields = rowType->size(); + const size_t nullLength = bits::nbytes(numFields); + + size_t size = nullLength; + for (const auto& child : rowType->children()) { + if (child->isTimestamp()) { + size += sizeof(int64_t); + } else if (child->isFixedWidth()) { + size += child->cppSizeInBytes(); + } else { + return std::nullopt; + } + } + + return size; +} + +int32_t CompactRow::rowSize(vector_size_t index) { + return rowRowSize(index); +} + +int32_t CompactRow::rowRowSize(vector_size_t index) { + auto childIndex = decoded_.index(index); + + const auto numFields = children_.size(); + int32_t size = rowNullBytes_; + + for (auto i = 0; i < numFields; ++i) { + if (childIsFixedWidth_[i]) { + size += children_[i].valueBytes_; + } else if (!children_[i].isNullAt(childIndex)) { + size += children_[i].variableWidthRowSize(childIndex); + } + } + + return size; +} + +int32_t CompactRow::serializeRow(vector_size_t index, char* buffer) { + auto childIndex = decoded_.index(index); + + int64_t valuesOffset = rowNullBytes_; + + auto* nulls = reinterpret_cast(buffer); + + for (auto i = 0; i < children_.size(); ++i) { + auto& child = children_[i]; + + // Write null bit. Advance offset if 'fixed-width'. + if (child.isNullAt(childIndex)) { + bits::setBit(nulls, i, true); + if (childIsFixedWidth_[i]) { + valuesOffset += child.valueBytes_; + } + continue; + } + + if (childIsFixedWidth_[i]) { + // Write fixed-width value. + if (child.valueBytes_ > 0) { + child.serializeFixedWidth(childIndex, buffer + valuesOffset); + } + valuesOffset += child.valueBytes_; + } else { + // Write non-null variable-width value. + auto size = + child.serializeVariableWidth(childIndex, buffer + valuesOffset); + valuesOffset += size; + } + } + + return valuesOffset; +} + +bool CompactRow::isNullAt(vector_size_t index) { + return decoded_.isNullAt(index); +} + +int32_t CompactRow::variableWidthRowSize(vector_size_t index) { + switch (typeKind_) { + case TypeKind::VARCHAR: + FOLLY_FALLTHROUGH; + case TypeKind::VARBINARY: { + auto value = decoded_.valueAt(index); + return sizeof(int32_t) + value.size(); + } + case TypeKind::ARRAY: + return arrayRowSize(index); + case TypeKind::MAP: + return mapRowSize(index); + case TypeKind::ROW: + return rowRowSize(index); + default: + VELOX_UNREACHABLE( + "Unexpected type kind: {}", mapTypeKindToName(typeKind_)); + }; +} + +int32_t CompactRow::arrayRowSize(vector_size_t index) { + auto baseIndex = decoded_.index(index); + + auto arrayBase = decoded_.base()->asUnchecked(); + auto offset = arrayBase->offsetAt(baseIndex); + auto size = arrayBase->sizeAt(baseIndex); + + return arrayRowSize(children_[0], offset, size, childIsFixedWidth_[0]); +} + +int32_t CompactRow::arrayRowSize( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth) { + const int32_t nullBytes = bits::nbytes(size); + + // array size | null bits | elements + + // 4 bytes for number of elements, some bytes for null flags. + int32_t rowSize = sizeof(int32_t) + nullBytes; + if (fixedWidth) { + return rowSize + size * elements.valueBytes(); + } + + if (size == 0) { + return rowSize; + } + + // If element type is a complex type, then add 4 bytes for overall serialized + // size of the array + 4 bytes per element for offset of the serialized + // element. 
+ // size | nulls | serialized size | serialized offset 1 | serialized offset 2 + // |...| element 1 | element 2 |... + + if (!(elements.typeKind_ == TypeKind::VARCHAR || + elements.typeKind_ == TypeKind::VARBINARY)) { + // 4 bytes for the overall serialized size + 4 bytes for the offset of each + // element. + rowSize += sizeof(int32_t) + size * sizeof(int32_t); + } + + for (auto i = 0; i < size; ++i) { + if (!elements.isNullAt(offset + i)) { + rowSize += elements.variableWidthRowSize(offset + i); + } + } + + return rowSize; +} + +int32_t CompactRow::serializeArray(vector_size_t index, char* buffer) { + auto baseIndex = decoded_.index(index); + + // For complex-type elements: + // array size | null bits | serialized size | offset e1 | offset e2 |... | e1 + // | e2 |... + // + // 'serialized size' is the number of bytes starting after null bits and to + // the end of the array. Offsets are specified relative to position right + // after 'serialized size'. + // + // For fixed-width or string element type: + // array size | null bite | e1 | e2 |... + + auto arrayBase = decoded_.base()->asUnchecked(); + auto offset = arrayBase->offsetAt(baseIndex); + auto size = arrayBase->sizeAt(baseIndex); + + return serializeAsArray( + children_[0], offset, size, childIsFixedWidth_[0], buffer); +} + +namespace { + +constexpr size_t kSizeBytes = sizeof(int32_t); + +void writeInt32(char* buffer, int32_t n) { + memcpy(buffer, &n, sizeof(int32_t)); +} + +int32_t readInt32(const char* buffer) { + int32_t n; + memcpy(&n, buffer, sizeof(int32_t)); + return n; +} +} // namespace + +int32_t CompactRow::serializeAsArray( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth, + char* buffer) { + // For complex-type elements: + // array size | null bits | serialized size | offset e1 | offset e2 |... | e1 + // | e2 |... + // + // For fixed-width and string element types: + // array size | null bits | e1 | e2 |... + + // Write array size. + writeInt32(buffer, size); + + // Write null flags. + const int32_t nullBytes = bits::nbytes(size); + const int32_t nullsOffset = kSizeBytes; + + int32_t elementsOffset = nullsOffset + nullBytes; + + auto* rawNulls = reinterpret_cast(buffer + nullsOffset); + + if (elements.supportsBulkCopy_) { + if (elements.decoded_.mayHaveNulls()) { + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } + } + } + elements.serializeFixedWidth(offset, size, buffer + elementsOffset); + return elementsOffset + size * elements.valueBytes_; + } + + if (fixedWidth) { + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } else { + elements.serializeFixedWidth(offset + i, buffer + elementsOffset); + } + elementsOffset += elements.valueBytes_; + } + } else if ( + elements.typeKind_ == TypeKind::VARCHAR || + elements.typeKind_ == TypeKind::VARBINARY) { + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } else { + auto serializedBytes = elements.serializeVariableWidth( + offset + i, buffer + elementsOffset); + elementsOffset += serializedBytes; + } + } + } else { + if (size > 0) { + // Leave room for serialized size and offsets. 
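To make the offset bookkeeping in this branch concrete, here is the layout arithmetic for an array of two non-null complex elements, as a standalone sketch with hypothetical element sizes:

```cpp
// Byte positions produced by serializeAsArray() for two complex elements
// whose serialized sizes are 10 and 14 bytes (hypothetical numbers).
#include <cstdio>

int main() {
  const int kSizeBytes = 4;
  int size = 2; // two elements
  int nullBytes = (size + 7) / 8; // bits::nbytes(2) == 1
  int elementsOffset = kSizeBytes + nullBytes; // serialized-size field at 5
  int baseOffset = elementsOffset + kSizeBytes; // offset table starts at 9
  int firstElement = baseOffset + size * kSizeBytes; // elements start at 17
  // Per-element offsets are stored relative to baseOffset:
  std::printf(
      "e1 at +%d, e2 at +%d\n",
      firstElement - baseOffset,
      firstElement + 10 - baseOffset); // e1 at +8, e2 at +18
  // The serialized-size field holds the total payload measured from
  // baseOffset to the end of the last element:
  std::printf("serialized size = %d\n", (firstElement + 10 + 14) - baseOffset);
  return 0;
}
```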
+ const size_t baseOffset = elementsOffset + kSizeBytes; + elementsOffset += kSizeBytes + size * kSizeBytes; + + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } else { + writeInt32( + buffer + baseOffset + i * kSizeBytes, + elementsOffset - baseOffset); + + auto serializedBytes = elements.serializeVariableWidth( + offset + i, buffer + elementsOffset); + + elementsOffset += serializedBytes; + } + } + + writeInt32(buffer + baseOffset - kSizeBytes, elementsOffset - baseOffset); + } + } + + return elementsOffset; +} + +int32_t CompactRow::mapRowSize(vector_size_t index) { + auto baseIndex = decoded_.index(index); + + // | + + auto mapBase = decoded_.base()->asUnchecked(); + auto offset = mapBase->offsetAt(baseIndex); + auto size = mapBase->sizeAt(baseIndex); + + return arrayRowSize(children_[0], offset, size, childIsFixedWidth_[0]) + + arrayRowSize(children_[1], offset, size, childIsFixedWidth_[1]); +} + +int32_t CompactRow::serializeMap(vector_size_t index, char* buffer) { + auto baseIndex = decoded_.index(index); + + // | + + auto mapBase = decoded_.base()->asUnchecked(); + auto offset = mapBase->offsetAt(baseIndex); + auto size = mapBase->sizeAt(baseIndex); + + auto keysSerializedBytes = serializeAsArray( + children_[0], offset, size, childIsFixedWidth_[0], buffer); + + auto valuesSerializedBytes = serializeAsArray( + children_[1], + offset, + size, + childIsFixedWidth_[1], + buffer + keysSerializedBytes); + + return keysSerializedBytes + valuesSerializedBytes; +} + +int32_t CompactRow::serialize(vector_size_t index, char* buffer) { + return serializeRow(index, buffer); +} + +void CompactRow::serializeFixedWidth(vector_size_t index, char* buffer) { + VELOX_DCHECK(fixedWidthTypeKind_); + switch (typeKind_) { + case TypeKind::BOOLEAN: + *reinterpret_cast(buffer) = decoded_.valueAt(index); + break; + case TypeKind::TIMESTAMP: { + auto micros = decoded_.valueAt(index).toMicros(); + memcpy(buffer, µs, sizeof(int64_t)); + break; + } + default: + memcpy( + buffer, + decoded_.data() + decoded_.index(index) * valueBytes_, + valueBytes_); + } +} + +void CompactRow::serializeFixedWidth( + vector_size_t offset, + vector_size_t size, + char* buffer) { + VELOX_DCHECK(supportsBulkCopy_); + // decoded_.data() can be null if all values are null. + if (decoded_.data()) { + memcpy( + buffer, + decoded_.data() + decoded_.index(offset) * valueBytes_, + valueBytes_ * size); + } +} + +int32_t CompactRow::serializeVariableWidth(vector_size_t index, char* buffer) { + switch (typeKind_) { + case TypeKind::VARCHAR: + FOLLY_FALLTHROUGH; + case TypeKind::VARBINARY: { + auto value = decoded_.valueAt(index); + writeInt32(buffer, value.size()); + if (!value.empty()) { + memcpy(buffer + kSizeBytes, value.data(), value.size()); + } + return kSizeBytes + value.size(); + } + case TypeKind::ARRAY: + return serializeArray(index, buffer); + case TypeKind::MAP: + return serializeMap(index, buffer); + case TypeKind::ROW: + return serializeRow(index, buffer); + default: + VELOX_UNREACHABLE( + "Unexpected type kind: {}", mapTypeKindToName(typeKind_)); + }; +} + +namespace { + +// Reads single fixed-width value from buffer and returns number of bytes read. +// Stores the value into flatVector[index]. 
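A simplified standalone version of the helper described above, ignoring nulls and the Timestamp special case, to show the contract of copying sizeof(T) bytes and reporting how many were consumed:

```cpp
// Sketch of the fixed-width read contract used by the deserializers below.
#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename T>
size_t readValue(const char* buffer, T& out) {
  std::memcpy(&out, buffer, sizeof(T));
  return sizeof(T); // Caller advances its cursor by the returned byte count.
}

int main() {
  char buffer[12] = {};
  int32_t a = 42;
  std::memcpy(buffer, &a, sizeof(a));

  int32_t b;
  size_t consumed = readValue(buffer, b);
  std::printf("%d read in %zu bytes\n", b, consumed); // 42 read in 4 bytes
  return 0;
}
```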
+template +size_t readFixedWidthValue( + bool isNull, + const char* buffer, + FlatVector* flatVector, + vector_size_t index) { + if (isNull) { + flatVector->setNull(index, true); + } else if constexpr (std::is_same_v) { + int64_t micros; + memcpy(µs, buffer, sizeof(int64_t)); + flatVector->set(index, Timestamp::fromMicros(micros)); + } else { + T value; + memcpy(&value, buffer, sizeof(T)); + flatVector->set(index, value); + } + + if constexpr (std::is_same_v) { + return sizeof(int64_t); + } else { + return sizeof(T); + } +} + +// Deserializes one fixed-width value from each 'row' in 'data'. +// Each value starts at data[row].data() + offsets[row]. +// +// @param nulls Null flags for the values. +// @param offsets In/out parameter that specifies offsets in 'data' for the +// serialized values. Advances past the serialized value. +template +VectorPtr deserializeFixedWidth( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + using T = typename TypeTraits::NativeType; + + const auto numRows = data.size(); + auto flatVector = BaseVector::create>(type, numRows, pool); + + auto* rawNulls = nulls->as(); + + for (auto i = 0; i < numRows; ++i) { + offsets[i] += readFixedWidthValue( + bits::isBitNull(rawNulls, i), + data[i].data() + offsets[i], + flatVector.get(), + i); + } + + return flatVector; +} + +vector_size_t totalSize(const vector_size_t* rawSizes, size_t numRows) { + vector_size_t total = 0; + for (auto i = 0; i < numRows; ++i) { + total += rawSizes[i]; + } + return total; +} + +const uint8_t* readNulls(const char* buffer) { + return reinterpret_cast(buffer); +} + +// Deserializes multiple fixed-width values from each 'row' in 'data'. +// Each set of values starts at data[row].data() + offsets[row] and contains +// null flags followed by values. The number of values is provided in +// sizes[row]. +// nulls | v1 | v2 | v3 |... +// Advances offsets past the last value. +template +VectorPtr deserializeFixedWidthArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + using T = typename TypeTraits::NativeType; + + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + + const auto total = totalSize(rawSizes, numRows); + + auto flatVector = BaseVector::create>(type, total, pool); + + vector_size_t index = 0; + for (auto i = 0; i < numRows; ++i) { + const auto size = rawSizes[i]; + if (size > 0) { + auto nullBytes = bits::nbytes(size); + + auto* rawElementNulls = readNulls(data[i].data() + offsets[i]); + + offsets[i] += nullBytes; + + for (auto j = 0; j < size; ++j) { + offsets[i] += readFixedWidthValue( + bits::isBitSet(rawElementNulls, j), + data[i].data() + offsets[i], + flatVector.get(), + index); + ++index; + } + } + } + + return flatVector; +} + +int32_t readString( + const char* buffer, + FlatVector* flatVector, + vector_size_t index) { + int32_t size = readInt32(buffer); + StringView value(buffer + kSizeBytes, size); + flatVector->set(index, value); + return kSizeBytes + size; +} + +VectorPtr deserializeUnknowns( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + return BaseVector::createNullConstant(UNKNOWN(), data.size(), pool); +} + +// Deserializes one string from each 'row' in 'data'. +// Each strings starts at data[row].data() + offsets[row]. +// string size | +// Advances the offsets past the strings. 
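All deserializers in this file share one calling convention: data[row] holds a row's serialized bytes and offsets[row] is a per-row cursor that each column reader advances past whatever it consumed. A schematic, self-contained illustration of that pattern:

```cpp
// Per-row cursor pattern: columns are deserialized one at a time, and each
// column reader advances every row's cursor past the bytes it consumed.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main() {
  // Two serialized "rows", each holding an int32 followed by more data.
  std::vector<std::string> data(2, std::string(12, '\0'));
  std::vector<size_t> offsets(2, 0);

  // Column 1: read an int32 from each row, advancing that row's cursor.
  for (size_t row = 0; row < data.size(); ++row) {
    int32_t v;
    std::memcpy(&v, data[row].data() + offsets[row], sizeof(v));
    offsets[row] += sizeof(v);
  }

  // Column 2 now starts at offset 4 in every row.
  std::printf("%zu %zu\n", offsets[0], offsets[1]); // 4 4
  return 0;
}
```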
+VectorPtr deserializeStrings( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto flatVector = + BaseVector::create>(type, numRows, pool); + + auto* rawNulls = nulls->as(); + + for (auto i = 0; i < numRows; ++i) { + if (bits::isBitNull(rawNulls, i)) { + flatVector->setNull(i, true); + } else { + offsets[i] += + readString(data[i].data() + offsets[i], flatVector.get(), i); + } + } + + return flatVector; +} + +VectorPtr deserializeUnknownArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + const auto total = totalSize(rawSizes, numRows); + + return BaseVector::createNullConstant(UNKNOWN(), total, pool); +} + +// Deserializes multiple strings from each 'row' in 'data'. +// Each set of strings starts at data[row].data() + offsets[row] and contains +// null flags followed by the strings. The number of strings is provided in +// sizes[row]. +// nulls | size-of-s1 | | size-of-s2 | |... +// Advances offsets past the last string. +VectorPtr deserializeStringArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + + const auto total = totalSize(rawSizes, numRows); + + auto flatVector = + BaseVector::create>(type, total, pool); + + vector_size_t index = 0; + for (auto i = 0; i < numRows; ++i) { + const auto size = rawSizes[i]; + if (size > 0) { + auto nullBytes = bits::nbytes(size); + + auto* rawElementNulls = readNulls(data[i].data() + offsets[i]); + + offsets[i] += nullBytes; + + for (auto j = 0; j < size; ++j) { + if (bits::isBitSet(rawElementNulls, j)) { + flatVector->setNull(index++, true); + } else { + offsets[i] += + readString(data[i].data() + offsets[i], flatVector.get(), index); + ++index; + } + } + } + } + + return flatVector; +} + +VectorPtr deserialize( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool); + +// Deserializes multiple arrays from each 'row' in 'data'. +// Each set of arrays starts at data[row].data() + offsets[row] and contains +// null flags followed by the arrays. The number of arrays is provided in +// sizes[row]. +// nulls | serializes size | offset-of-a1 | offset-of-a2 |... +// |size-of-a1 | nulls-of-a1-elements | |... +// +// Advances offsets past the last array. +VectorPtr deserializeComplexArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + + const auto total = totalSize(rawSizes, numRows); + + BufferPtr nulls = allocateNulls(total, pool); + auto* rawNulls = nulls->asMutable(); + + std::vector nestedData; + nestedData.reserve(total); + std::vector nestedOffsets; + nestedOffsets.reserve(total); + + vector_size_t nestedIndex = 0; + for (auto i = 0; i < numRows; ++i) { + const auto size = rawSizes[i]; + if (size > 0) { + // Read nulls. + auto* rawElementNulls = readNulls(data[i].data() + offsets[i]); + offsets[i] += bits::nbytes(size); + + // Read serialized size. + auto serializedSize = readInt32(data[i].data() + offsets[i]); + offsets[i] += kSizeBytes; + + // Read offsets of individual elements. 
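The loop below turns each stored relative offset into an absolute position by adding it to the row's cursor, which at this point references the start of the offset table. A condensed standalone sketch of that address arithmetic, with made-up numbers:

```cpp
// Resolving absolute element positions from the relative offset table, as
// deserializeComplexArrays() does. Hypothetical values, not Velox code.
#include <cstdint>
#include <cstdio>

int main() {
  size_t rowCursor = 100; // offsets[i]: start of the offset table in the row
  int32_t storedOffsets[] = {8, 18}; // relative to the table start
  for (int32_t j = 0; j < 2; ++j) {
    // Mirrors nestedOffsets.push_back(offsets[i] + nestedOffset).
    std::printf(
        "element %d at absolute offset %zu\n", j, rowCursor + storedOffsets[j]);
  }
  return 0;
}
```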
+ auto buffer = data[i].data() + offsets[i]; + for (auto j = 0; j < size; ++j) { + if (bits::isBitSet(rawElementNulls, j)) { + bits::setNull(rawNulls, nestedIndex++); + } else { + int32_t nestedOffset = readInt32(buffer + j * kSizeBytes); + nestedOffsets.push_back(offsets[i] + nestedOffset); + nestedData.push_back(data[i]); + ++nestedIndex; + } + } + + offsets[i] += serializedSize; + } + } + + return deserialize(type, nestedData, nulls, nestedOffsets, pool); +} + +// Deserializes one array from each 'row' in 'data'. +// Each array starts at data[row].data() + offsets[row]. +// size | element nulls | serialized size (if complex type elements) +// | element offsets (if complex type elements) | e1 | e2 | e3 |... +// +// Advances the offsets past the arrays. +ArrayVectorPtr deserializeArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + + auto* rawNulls = nulls->as(); + + BufferPtr arrayOffsets = allocateOffsets(numRows, pool); + auto* rawArrayOffsets = arrayOffsets->asMutable(); + + BufferPtr arraySizes = allocateSizes(numRows, pool); + auto* rawArraySizes = arraySizes->asMutable(); + + vector_size_t arrayOffset = 0; + + for (auto i = 0; i < numRows; ++i) { + if (!bits::isBitNull(rawNulls, i)) { + // Read array size. + int32_t size = readInt32(data[i].data() + offsets[i]); + offsets[i] += kSizeBytes; + + rawArrayOffsets[i] = arrayOffset; + rawArraySizes[i] = size; + arrayOffset += size; + } + } + + VectorPtr elements; + const auto& elementType = type->childAt(0); + if (elementType->isUnKnown()) { + elements = + deserializeUnknownArrays(elementType, data, arraySizes, offsets, pool); + } else if (elementType->isFixedWidth()) { + elements = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + deserializeFixedWidthArrays, + elementType->kind(), + elementType, + data, + arraySizes, + offsets, + pool); + } else { + switch (elementType->kind()) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + elements = deserializeStringArrays( + elementType, data, arraySizes, offsets, pool); + break; + case TypeKind::ARRAY: + case TypeKind::MAP: + case TypeKind::ROW: + elements = deserializeComplexArrays( + elementType, data, arraySizes, offsets, pool); + break; + default: + VELOX_UNREACHABLE("{}", elementType->toString()); + } + } + + return std::make_shared( + pool, type, nulls, numRows, arrayOffsets, arraySizes, elements); +} + +// Deserializes one map from each 'row' in 'data'. +// Each map starts at data[row].data() + offsets[row]. +// array-of-keys | array-of-values +// Advances the offsets past the maps. 
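Because a map cell is serialized as its keys array immediately followed by its values array, map sizes compose additively from the array accounting above. A worked standalone example for the BIGINT map {1: 10, 2: 20}:

```cpp
// Size of a serialized map cell: keys array plus values array, each laid out
// as int32 size | null bits | elements, per mapRowSize() earlier in the file.
#include <cstdio>

int main() {
  int keysArray = 4 + 1 + 8 * 2;   // size + null byte + two BIGINT keys
  int valuesArray = 4 + 1 + 8 * 2; // size + null byte + two BIGINT values
  std::printf("%d\n", keysArray + valuesArray); // 42 bytes for the map cell
  return 0;
}
```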
+VectorPtr deserializeMaps( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + auto arrayOfKeysType = ARRAY(type->childAt(0)); + auto arrayOfValuesType = ARRAY(type->childAt(1)); + auto arrayOfKeys = + deserializeArrays(arrayOfKeysType, data, nulls, offsets, pool); + auto arrayOfValues = + deserializeArrays(arrayOfValuesType, data, nulls, offsets, pool); + + return std::make_shared( + pool, + type, + nulls, + data.size(), + arrayOfKeys->offsets(), + arrayOfKeys->sizes(), + arrayOfKeys->elements(), + arrayOfValues->elements()); +} + +RowVectorPtr deserializeRows( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool); + +// Switches on 'type' and calls type-specific deserialize method to deserialize +// one value from each 'row' in 'data' starting at the specified offset. +// Each value starts at data[row].data() + offsets[row]. +VectorPtr deserialize( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto typeKind = type->kind(); + + if (typeKind == TypeKind::UNKNOWN) { + return deserializeUnknowns(type, data, nulls, offsets, pool); + } + + if (type->isFixedWidth()) { + return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + deserializeFixedWidth, typeKind, type, data, nulls, offsets, pool); + } + switch (typeKind) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return deserializeStrings(type, data, nulls, offsets, pool); + break; + case TypeKind::ARRAY: + return deserializeArrays(type, data, nulls, offsets, pool); + break; + case TypeKind::MAP: + return deserializeMaps(type, data, nulls, offsets, pool); + break; + case TypeKind::ROW: + return deserializeRows(type, data, nulls, offsets, pool); + break; + default: + VELOX_UNREACHABLE("{}", type->toString()); + } +} + +// Deserializes one struct from each 'row' in 'data'. +// nulls | field1 | field2 |... +RowVectorPtr deserializeRows( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + const size_t numFields = type->size(); + + std::vector fields; + + auto* rawNulls = nulls != nullptr ? 
nulls->as() : nullptr; + + std::vector fieldNulls; + fieldNulls.reserve(numFields); + for (auto i = 0; i < numFields; ++i) { + fieldNulls.emplace_back(allocateNulls(numRows, pool)); + auto* rawFieldNulls = fieldNulls.back()->asMutable(); + for (auto row = 0; row < numRows; ++row) { + auto* serializedNulls = readNulls(data[row].data() + offsets[row]); + const auto isNull = + (rawNulls != nullptr && bits::isBitNull(rawNulls, row)) || + bits::isBitSet(serializedNulls, i); + bits::setBit(rawFieldNulls, row, !isNull); + } + } + + const size_t nullLength = bits::nbytes(numFields); + for (auto row = 0; row < numRows; ++row) { + offsets[row] += nullLength; + } + + for (auto i = 0; i < numFields; ++i) { + auto field = + deserialize(type->childAt(i), data, fieldNulls[i], offsets, pool); + fields.emplace_back(std::move(field)); + } + + return std::make_shared( + pool, type, nulls, numRows, std::move(fields)); +} + +} // namespace + +// static +RowVectorPtr CompactRow::deserialize( + const std::vector& data, + const RowTypePtr& rowType, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + std::vector offsets(numRows, 0); + + return deserializeRows(rowType, data, nullptr, offsets, pool); +} + +} // namespace facebook::velox::row diff --git a/velox/row/CompactRow.h b/velox/row/CompactRow.h new file mode 100644 index 000000000000..9abaed0bdee2 --- /dev/null +++ b/velox/row/CompactRow.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/vector/ComplexVector.h" +#include "velox/vector/DecodedVector.h" + +namespace facebook::velox::row { + +class CompactRow { + public: + explicit CompactRow(const RowVectorPtr& vector); + + /// Returns row size if all fields are fixed width. Return std::nullopt if + /// there are variable-width fields. + static std::optional fixedRowSize(const RowTypePtr& rowType); + + /// Returns serialized size of the row at specified index. Use only if + /// 'fixedRowSize' returned std::nullopt. + int32_t rowSize(vector_size_t index); + + /// Serializes row at specified index into 'buffer'. + /// 'buffer' must have sufficient capacity and set to all zeros. + int32_t serialize(vector_size_t index, char* buffer); + + /// Deserializes multiple rows into a RowVector of specified type. The type + /// must match the contents of the serialized rows. + static RowVectorPtr deserialize( + const std::vector& data, + const RowTypePtr& rowType, + memory::MemoryPool* pool); + + private: + explicit CompactRow(const VectorPtr& vector); + + void initialize(const TypePtr& type); + + bool isNullAt(vector_size_t); + + /// Fixed-width types only. Returns number of bytes used by single value. + int32_t valueBytes() const { + return valueBytes_; + } + + /// Writes fixed-width value at specified index into 'buffer'. Value must not + /// be null. 
+ void serializeFixedWidth(vector_size_t index, char* buffer); + + /// Writes range of fixed-width values between 'offset' and 'offset + size' + /// into 'buffer'. Values can be null. + void + serializeFixedWidth(vector_size_t offset, vector_size_t size, char* buffer); + + /// Returns serialized size of variable-width row. + int32_t variableWidthRowSize(vector_size_t index); + + /// Writes variable-width value at specified index into 'buffer'. Value must + /// not be null. Returns number of bytes written to 'buffer'. + int32_t serializeVariableWidth(vector_size_t index, char* buffer); + + private: + /// Returns serialized size of array row. + int32_t arrayRowSize(vector_size_t index); + + /// Serializes array value to buffer. Value must not be null. Returns number + /// of bytes written to 'buffer'. + int32_t serializeArray(vector_size_t index, char* buffer); + + /// Returns serialized size of map row. + int32_t mapRowSize(vector_size_t index); + + /// Serializes map value to buffer. Value must not be null. Returns number of + /// bytes written to 'buffer'. + int32_t serializeMap(vector_size_t index, char* buffer); + + /// Returns serialized size of a range of values. + int32_t arrayRowSize( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth); + + /// Serializes a range of values into buffer. Returns number of bytes written + /// to 'buffer'. + int32_t serializeAsArray( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth, + char* buffer); + + /// Returns serialized size of struct value. + int32_t rowRowSize(vector_size_t index); + + /// Serializes struct value to buffer. Value must not be null. + int32_t serializeRow(vector_size_t index, char* buffer); + + const TypeKind typeKind_; + DecodedVector decoded_; + + /// True if values of 'typeKind_' have fixed width. + bool fixedWidthTypeKind_{false}; + + /// ARRAY, MAP and ROW types only. + std::vector children_; + std::vector childIsFixedWidth_; + + /// True if this is a flat fixed-width vector whose consecutive values can be + /// copied into serialized buffer in bulk. + bool supportsBulkCopy_{false}; + + // ROW type only. Number of bytes used by null flags. + size_t rowNullBytes_; + + // Fixed-width types only. Number of bytes used for a single value. 
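Putting the public API together, a condensed round-trip sketch, essentially what the CompactRowTest added later in this diff does; it assumes a RowVectorPtr 'data' and a MemoryPool* 'pool' are in scope:

```cpp
// Sketch only: serialize every row of 'data' into one zeroed buffer, then
// deserialize the views back into an equivalent RowVector.
#include <string_view>
#include <vector>
#include "velox/row/CompactRow.h"

using namespace facebook::velox;

RowVectorPtr roundTrip(const RowVectorPtr& data, memory::MemoryPool* pool) {
  auto rowType = asRowType(data->type());
  row::CompactRow row(data);

  // Fast path when every field is fixed-width; per-row sizes otherwise.
  size_t totalSize = 0;
  if (auto fixedRowSize = row::CompactRow::fixedRowSize(rowType)) {
    totalSize = fixedRowSize.value() * data->size();
  } else {
    for (vector_size_t i = 0; i < data->size(); ++i) {
      totalSize += row.rowSize(i);
    }
  }

  // serialize() requires a zero-initialized buffer; the trailing 0 does that.
  BufferPtr buffer = AlignedBuffer::allocate<char>(totalSize, pool, 0);
  auto* rawBuffer = buffer->asMutable<char>();

  std::vector<std::string_view> serialized;
  size_t offset = 0;
  for (vector_size_t i = 0; i < data->size(); ++i) {
    auto size = row.serialize(i, rawBuffer + offset);
    serialized.emplace_back(rawBuffer + offset, size);
    offset += size;
  }

  return row::CompactRow::deserialize(serialized, rowType, pool);
}
```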
+ size_t valueBytes_; +}; +} // namespace facebook::velox::row diff --git a/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp b/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp index a5f096efa6cd..f751f201d93b 100644 --- a/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp +++ b/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp @@ -18,6 +18,8 @@ #include "velox/common/memory/HashStringAllocator.h" #include "velox/exec/ContainerRowSerde.h" +#include "velox/row/CompactRow.h" +#include "velox/row/UnsafeRowDeserializers.h" #include "velox/row/UnsafeRowFast.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -26,6 +28,87 @@ namespace { class SerializeBenchmark { public: + void serializeUnsafe(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + suspender.dismiss(); + + UnsafeRowFast fast(data); + auto totalSize = computeTotalSize(fast, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(fast, data->size(), buffer); + VELOX_CHECK_EQ(serialized.size(), data->size()); + } + + void deserializeUnsafe(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + UnsafeRowFast fast(data); + auto totalSize = computeTotalSize(fast, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(fast, data->size(), buffer); + suspender.dismiss(); + + auto copy = UnsafeRowDeserializer::deserialize(serialized, rowType, pool()); + VELOX_CHECK_EQ(copy->size(), data->size()); + } + + void serializeCompact(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + suspender.dismiss(); + + CompactRow compact(data); + auto totalSize = computeTotalSize(compact, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(compact, data->size(), buffer); + VELOX_CHECK_EQ(serialized.size(), data->size()); + } + + void deserializeCompact(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + CompactRow compact(data); + auto totalSize = computeTotalSize(compact, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(compact, data->size(), buffer); + suspender.dismiss(); + + auto copy = CompactRow::deserialize(serialized, rowType, pool()); + VELOX_CHECK_EQ(copy->size(), data->size()); + } + + void serializeContainer(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + suspender.dismiss(); + + HashStringAllocator allocator(pool()); + auto position = serialize(data, allocator); + VELOX_CHECK_NOT_NULL(position.header); + } + + void deserializeContainer(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + + HashStringAllocator allocator(pool()); + auto position = serialize(data, allocator); + VELOX_CHECK_NOT_NULL(position.header); + suspender.dismiss(); + + auto copy = BaseVector::create(rowType, data->size(), pool()); + + ByteStream in; + HashStringAllocator::prepareRead(position.header, in); + for (auto i = 0; i < data->size(); ++i) { + exec::ContainerRowSerde::deserialize(in, i, copy.get()); + } + + VELOX_CHECK_EQ(copy->size(), data->size()); + } + + private: RowVectorPtr makeData(const RowTypePtr& rowType) { VectorFuzzer::Options options; options.vectorSize = 1'000; @@ -36,52 +119,84 @@ class SerializeBenchmark { 
return fuzzer.fuzzInputRow(rowType); } - void runUnsafe(const RowTypePtr& rowType) { - folly::BenchmarkSuspender suspender; - auto data = makeData(rowType); - suspender.dismiss(); - - UnsafeRowFast fast(data); - + size_t computeTotalSize( + UnsafeRowFast& unsafeRow, + const RowTypePtr& rowType, + vector_size_t numRows) { size_t totalSize = 0; if (auto fixedRowSize = UnsafeRowFast::fixedRowSize(rowType)) { - totalSize += fixedRowSize.value() * data->size(); + totalSize += fixedRowSize.value() * numRows; } else { - for (auto i = 0; i < data->size(); ++i) { - auto rowSize = fast.rowSize(i); + for (auto i = 0; i < numRows; ++i) { + auto rowSize = unsafeRow.rowSize(i); totalSize += rowSize; } } + return totalSize; + } - auto buffer = AlignedBuffer::allocate(totalSize, pool()); + std::vector> serialize( + UnsafeRowFast& unsafeRow, + vector_size_t numRows, + BufferPtr& buffer) { + std::vector> serialized; auto rawBuffer = buffer->asMutable(); size_t offset = 0; - for (auto i = 0; i < data->size(); ++i) { - auto rowSize = fast.serialize(i, rawBuffer + offset); + for (auto i = 0; i < numRows; ++i) { + auto rowSize = unsafeRow.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, rowSize)); offset += rowSize; } - VELOX_CHECK_EQ(totalSize, offset); + VELOX_CHECK_EQ(buffer->size(), offset); + return serialized; } - void runContainer(const RowTypePtr& rowType) { - folly::BenchmarkSuspender suspender; - auto data = makeData(rowType); - suspender.dismiss(); + size_t computeTotalSize( + CompactRow& compactRow, + const RowTypePtr& rowType, + vector_size_t numRows) { + size_t totalSize = 0; + if (auto fixedRowSize = CompactRow::fixedRowSize(rowType)) { + totalSize += fixedRowSize.value() * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + auto rowSize = compactRow.rowSize(i); + totalSize += rowSize; + } + } + return totalSize; + } - HashStringAllocator allocator(pool()); + std::vector + serialize(CompactRow& compactRow, vector_size_t numRows, BufferPtr& buffer) { + std::vector serialized; + auto rawBuffer = buffer->asMutable(); + + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + auto rowSize = compactRow.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, rowSize)); + offset += rowSize; + } + + VELOX_CHECK_EQ(buffer->size(), offset); + return serialized; + } + + HashStringAllocator::Position serialize( + const RowVectorPtr& data, + HashStringAllocator& allocator) { ByteStream out(&allocator); auto position = allocator.newWrite(out); for (auto i = 0; i < data->size(); ++i) { exec::ContainerRowSerde::serialize(*data, i, out); } allocator.finishWrite(out, 0); - - VELOX_CHECK_GT(out.size(), 0); + return position; } - private: memory::MemoryPool* pool() { return pool_.get(); } @@ -89,142 +204,86 @@ class SerializeBenchmark { std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; }; -BENCHMARK(unsafe_fixedWidth5) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})); -} - -BENCHMARK_RELATIVE(container_fixedWidth5) { - SerializeBenchmark benchmark; - benchmark.runContainer( - ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})); -} - -BENCHMARK(unsafe_fixedWidth10) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({ - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - DOUBLE(), - BIGINT(), - BIGINT(), - BIGINT(), - })); -} - -BENCHMARK_RELATIVE(container_fixedWidth10) { - SerializeBenchmark benchmark; - 
benchmark.runContainer(ROW({ - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - DOUBLE(), - BIGINT(), - BIGINT(), - BIGINT(), - })); -} - -BENCHMARK(unsafe_fixedWidth20) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({ - BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), - BIGINT(), BIGINT(), BIGINT(), DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), - DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), BIGINT(), BIGINT(), - })); -} - -BENCHMARK_RELATIVE(container_fixedWidth20) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({ - BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), - BIGINT(), BIGINT(), BIGINT(), DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), - DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), BIGINT(), BIGINT(), - })); -} - -BENCHMARK(unsafe_strings1) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), VARCHAR()})); -} - -BENCHMARK_RELATIVE(container_strings1) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), VARCHAR()})); -} - -BENCHMARK(unsafe_strings5) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({ - BIGINT(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - })); -} - -BENCHMARK_RELATIVE(container_strings5) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({ - BIGINT(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - })); -} - -BENCHMARK(unsafe_arrays) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), ARRAY(BIGINT())})); -} - -BENCHMARK_RELATIVE(container_arrays) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), ARRAY(BIGINT())})); -} - -BENCHMARK(unsafe_nestedArrays) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))})); -} - -BENCHMARK_RELATIVE(container_nestedArrays) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))})); -} - -BENCHMARK(unsafe_maps) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), MAP(BIGINT(), REAL())})); -} - -BENCHMARK_RELATIVE(container_maps) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), MAP(BIGINT(), REAL())})); -} - -BENCHMARK(unsafe_structs) { - SerializeBenchmark benchmark; - benchmark.runUnsafe( - ROW({BIGINT(), ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})})); -} +#define SERDE_BENCHMARKS(name, rowType) \ + BENCHMARK(unsafe_serialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.serializeUnsafe(rowType); \ + } \ + \ + BENCHMARK(compact_serialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.serializeCompact(rowType); \ + } \ + \ + BENCHMARK(container_serialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.serializeContainer(rowType); \ + } \ + \ + BENCHMARK(unsafe_deserialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.deserializeUnsafe(rowType); \ + } \ + \ + BENCHMARK(compact_deserialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.deserializeCompact(rowType); \ + } \ + \ + BENCHMARK(container_deserialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.deserializeContainer(rowType); \ + } -BENCHMARK_RELATIVE(container_structs) { - SerializeBenchmark benchmark; - benchmark.runContainer( - ROW({BIGINT(), ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})})); -} +SERDE_BENCHMARKS( + fixedWidth5, + ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})); + +SERDE_BENCHMARKS( + fixedWidth10, + ROW({ + BIGINT(), + 
BIGINT(), + BIGINT(), + BIGINT(), + BIGINT(), + BIGINT(), + DOUBLE(), + BIGINT(), + BIGINT(), + BIGINT(), + })); + +SERDE_BENCHMARKS( + fixedWidth20, + ROW({ + BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), + BIGINT(), BIGINT(), BIGINT(), DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), + DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), BIGINT(), BIGINT(), + })); + +SERDE_BENCHMARKS(strings1, ROW({BIGINT(), VARCHAR()})); + +SERDE_BENCHMARKS( + strings5, + ROW({ + BIGINT(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + })); + +SERDE_BENCHMARKS(arrays, ROW({BIGINT(), ARRAY(BIGINT())})); + +SERDE_BENCHMARKS(nestedArrays, ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))})); + +SERDE_BENCHMARKS(maps, ROW({BIGINT(), MAP(BIGINT(), REAL())})); + +SERDE_BENCHMARKS( + structs, + ROW({BIGINT(), ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})})); } // namespace } // namespace facebook::velox::row diff --git a/velox/row/tests/CMakeLists.txt b/velox/row/tests/CMakeLists.txt index 0f6792fb5a87..271cd1fb88cc 100644 --- a/velox/row/tests/CMakeLists.txt +++ b/velox/row/tests/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_executable(velox_row_test UnsafeRowFuzzTest.cpp) +add_executable(velox_row_test UnsafeRowFuzzTest.cpp CompactRowTest.cpp) add_test(velox_row_test velox_row_test) diff --git a/velox/row/tests/CompactRowTest.cpp b/velox/row/tests/CompactRowTest.cpp new file mode 100644 index 000000000000..46417d06ee8f --- /dev/null +++ b/velox/row/tests/CompactRowTest.cpp @@ -0,0 +1,518 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "velox/row/CompactRow.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook::velox::test; + +namespace facebook::velox::row { +namespace { + +class CompactRowTest : public ::testing::Test, public VectorTestBase { + protected: + void testRoundTrip(const RowVectorPtr& data) { + SCOPED_TRACE(data->toString()); + + auto rowType = asRowType(data->type()); + auto numRows = data->size(); + + CompactRow row(data); + + size_t totalSize = 0; + if (auto fixedRowSize = CompactRow::fixedRowSize(rowType)) { + totalSize = fixedRowSize.value() * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + totalSize += row.rowSize(i); + } + } + + std::vector serialized; + + BufferPtr buffer = AlignedBuffer::allocate(totalSize, pool(), 0); + auto* rawBuffer = buffer->asMutable(); + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + auto size = row.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, size)); + offset += size; + + VELOX_CHECK_EQ(size, row.rowSize(i), "Row {}: {}", i, data->toString(i)); + } + + VELOX_CHECK_EQ(offset, totalSize); + + auto copy = CompactRow::deserialize(serialized, rowType, pool()); + assertEqualVectors(data, copy); + } +}; + +TEST_F(CompactRowTest, fixedRowSize) { + ASSERT_EQ(1 + 1, CompactRow::fixedRowSize(ROW({BOOLEAN()}))); + ASSERT_EQ(1 + 8, CompactRow::fixedRowSize(ROW({BIGINT()}))); + ASSERT_EQ(1 + 4, CompactRow::fixedRowSize(ROW({INTEGER()}))); + ASSERT_EQ(1 + 2, CompactRow::fixedRowSize(ROW({SMALLINT()}))); + ASSERT_EQ(1 + 8, CompactRow::fixedRowSize(ROW({DOUBLE()}))); + ASSERT_EQ(std::nullopt, CompactRow::fixedRowSize(ROW({VARCHAR()}))); + ASSERT_EQ(std::nullopt, CompactRow::fixedRowSize(ROW({ARRAY(BIGINT())}))); + ASSERT_EQ( + 1 + 1 + 8 + 4 + 2 + 8, + CompactRow::fixedRowSize( + ROW({BOOLEAN(), BIGINT(), INTEGER(), SMALLINT(), DOUBLE()}))); + + ASSERT_EQ(std::nullopt, CompactRow::fixedRowSize(ROW({BIGINT(), VARCHAR()}))); + ASSERT_EQ( + std::nullopt, + CompactRow::fixedRowSize(ROW({BIGINT(), ROW({VARCHAR()})}))); + + ASSERT_EQ(1, CompactRow::fixedRowSize(ROW({UNKNOWN()}))); +} + +TEST_F(CompactRowTest, rowSizeString) { + auto data = makeRowVector({ + makeFlatVector({"a", "abc", "Longer string", "d", ""}), + }); + + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for string size. N bytes for the string + // itself. + ASSERT_EQ(1 + 4 + 1, row.rowSize(0)); + ASSERT_EQ(1 + 4 + 3, row.rowSize(1)); + ASSERT_EQ(1 + 4 + 13, row.rowSize(2)); + ASSERT_EQ(1 + 4 + 1, row.rowSize(3)); + ASSERT_EQ(1 + 4 + 0, row.rowSize(4)); +} + +TEST_F(CompactRowTest, rowSizeArrayOfBigint) { + auto data = makeRowVector({ + makeArrayVector({ + {1, 2, 3}, + {4, 5}, + {}, + {6}, + }), + }); + + { + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for array + // size. 1 byte for null flags for elements. N bytes for array elements. + ASSERT_EQ(1 + 4 + 1 + 8 * 3, row.rowSize(0)); + ASSERT_EQ(1 + 4 + 1 + 8 * 2, row.rowSize(1)); + ASSERT_EQ(1 + 4, row.rowSize(2)); + ASSERT_EQ(1 + 4 + 1 + 8, row.rowSize(3)); + } + + data = makeRowVector({ + makeNullableArrayVector({ + {{1, 2, std::nullopt, 3}}, + {{4, 5}}, + {{}}, + std::nullopt, + {{6}}, + }), + }); + + { + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for array + // size. 1 byte for null flags for elements. N bytes for array elements. 
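Note that a null fixed-width element still occupies its value slot, since serializeAsArray() advances past valueBytes_ even for nulls. Written out as standalone arithmetic for the nullable array {1, 2, null, 3} checked below:

```cpp
// Expected rowSize for the row holding the nullable BIGINT array
// {1, 2, null, 3}: the null element still occupies its 8-byte slot.
#include <cstdio>

int main() {
  int rowNullFlags = 1; // bits::nbytes(1 top-level field)
  int arraySize = 4;    // int32 element count
  int elementNulls = 1; // bits::nbytes(4 elements)
  int elements = 8 * 4; // four BIGINT slots, including the null one
  std::printf("%d\n", rowNullFlags + arraySize + elementNulls + elements); // 38
  return 0;
}
```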
+ ASSERT_EQ(1 + 4 + 1 + 8 * 4, row.rowSize(0)); + ASSERT_EQ(1 + 4 + 1 + 8 * 2, row.rowSize(1)); + ASSERT_EQ(1 + 4, row.rowSize(2)); + ASSERT_EQ(1, row.rowSize(3)); + ASSERT_EQ(1 + 4 + 1 + 8, row.rowSize(4)); + } +} + +TEST_F(CompactRowTest, rowSizeMixed) { + auto data = makeRowVector({ + makeNullableFlatVector({1, 2, 3, std::nullopt}), + makeNullableFlatVector({"a", "abc", "", std::nullopt}), + }); + + CompactRow row(data); + + // 1 byte for null flags. 8 bytes for bigint field. 4 bytes for string size. + // N bytes for the string itself. + ASSERT_EQ(1 + 8 + (4 + 1), row.rowSize(0)); + ASSERT_EQ(1 + 8 + (4 + 3), row.rowSize(1)); + ASSERT_EQ(1 + 8 + (4 + 0), row.rowSize(2)); + ASSERT_EQ(1 + 8, row.rowSize(3)); +} + +TEST_F(CompactRowTest, rowSizeArrayOfStrings) { + auto data = makeRowVector({ + makeArrayVector({ + {"a", "Abc"}, + {}, + {"a", "Longer string", "abc"}, + }), + }); + + { + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for array + // size. 1 byte for nulls flags for elements. 4 bytes for serialized size. 4 + // bytes per offset of an element. N bytes for elements. Each string element + // is 4 bytes for size + string length. + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + (4 + 3), row.rowSize(0)); + ASSERT_EQ(1 + 4, row.rowSize(1)); + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + (4 + 13) + (4 + 3), row.rowSize(2)); + } + + data = makeRowVector({ + makeNullableArrayVector({ + {{"a", "Abc", std::nullopt}}, + {{}}, + std::nullopt, + {{"a", std::nullopt, "Longer string", "abc"}}, + }), + }); + + { + CompactRow row(data); + + // Null strings do not take space. + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + (4 + 3) + 0, row.rowSize(0)); + ASSERT_EQ(1 + 4, row.rowSize(1)); + ASSERT_EQ(1, row.rowSize(2)); + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + 0 + (4 + 13) + (4 + 3), row.rowSize(3)); + } +} + +TEST_F(CompactRowTest, boolean) { + auto data = makeRowVector({ + makeFlatVector( + {true, false, true, true, false, false, true, false}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableFlatVector({ + true, + false, + std::nullopt, + true, + std::nullopt, + false, + true, + false, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, bigint) { + auto data = makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableFlatVector( + {1, std::nullopt, 3, std::nullopt, 5, std::nullopt}), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, hugeint) { + auto data = makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableFlatVector( + {std::nullopt, 1, 2, std::nullopt, std::nullopt, 3, 4, 5}), + }); + + testRoundTrip(data); +} + +Timestamp ts(int64_t micros) { + return Timestamp::fromMicros(micros); +} + +TEST_F(CompactRowTest, timestamp) { + auto data = makeRowVector({ + makeFlatVector({ + ts(0), + ts(1), + ts(2), + }), + }); + + testRoundTrip(data); + + // Serialize null Timestamp values with null flags set over a large + // non-serializable value (e.g. a value that triggers an exception in + // Timestamp::toMicros()). 
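CompactRow stores timestamps as int64 microseconds, toMicros() on write and fromMicros() on read, so values like Timestamp::max() that cannot convert must sit behind a null flag. A small sketch of the round trip the format relies on (the header path is assumed from Velox's layout):

```cpp
// Round trip through the microsecond representation CompactRow serializes.
#include <cassert>
#include "velox/type/Timestamp.h"

int main() {
  using facebook::velox::Timestamp;
  Timestamp t = Timestamp::fromMicros(123'456);
  assert(t.toMicros() == 123'456);
  // Timestamp::max() is outside the convertible range, which is why the test
  // below only stores it under a null flag.
  return 0;
}
```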
+ data = makeRowVector({ + makeFlatVector({ + ts(0), + Timestamp::max(), + ts(123'456), + Timestamp::min(), + }), + }); + + data->childAt(0)->setNull(1, true); + data->childAt(0)->setNull(3, true); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, string) { + auto data = makeRowVector({ + makeFlatVector({"a", "Abc", "", "Longer test string"}), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, unknown) { + auto data = makeRowVector({ + makeAllNullFlatVector(10), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeArrayVector({0, 3, 5, 9}, makeAllNullFlatVector(10)), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, mix) { + auto data = makeRowVector({ + makeFlatVector({"a", "Abc", "", "Longer test string"}), + makeAllNullFlatVector(4), + makeFlatVector({1, 2, 3, 4}), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, arrayOfBigint) { + auto data = makeRowVector({ + makeArrayVector({ + {1, 2, 3}, + {4, 5}, + {6}, + {}, + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableArrayVector({ + {{1, 2, std::nullopt, 3}}, + {{4, 5, std::nullopt}}, + {{std::nullopt, 6}}, + {{std::nullopt}}, + std::nullopt, + {{}}, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, arrayOfTimestamp) { + auto data = makeRowVector({ + makeArrayVector({ + {ts(1), ts(2), ts(3)}, + {ts(4), ts(5)}, + {ts(6)}, + {}, + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableArrayVector({ + {{ts(1), ts(2), std::nullopt, ts(3)}}, + {{ts(4), ts(5), std::nullopt}}, + {{std::nullopt, ts(6)}}, + {{std::nullopt}}, + std::nullopt, + {{}}, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, arrayOfString) { + auto data = makeRowVector({ + makeArrayVector({ + {"a", "abc", "Longer test string"}, + {"b", "Abc 12345 ...test", "foo"}, + {}, + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableArrayVector({ + {{"a", std::nullopt, "abc", "Longer test string"}}, + {{std::nullopt, + "b", + std::nullopt, + "Abc 12345 ...test", + std::nullopt, + "foo"}}, + {{}}, + {{std::nullopt}}, + std::nullopt, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, map) { + auto data = makeRowVector({ + makeMapVector( + {{{1, 10}, {2, 20}, {3, 30}}, {{1, 11}, {2, 22}}, {{4, 444}}, {}}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeMapVector({ + {{"a", "100"}, + {"b", "200"}, + {"Long string for testing", "Another long string"}}, + {{"abc", "300"}, {"d", "400"}}, + {}, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, row) { + auto data = makeRowVector({ + makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + makeFlatVector({1.05, 2.05, 3.05, 4.05, 5.05}), + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + makeFlatVector( + {"a", "Abc", "Long test string", "", "d"}), + makeFlatVector({1.05, 2.05, 3.05, 4.05, 5.05}), + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeRowVector( + { + makeFlatVector({1, 2, 3, 4, 5}), + makeNullableFlatVector({-1, 2, -3, std::nullopt, -5}), + makeFlatVector({1.05, 2.05, 3.05, 4.05, 5.05}), + makeFlatVector( + {"a", "Abc", "Long test string", "", "d"}), + }, + nullEvery(2)), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, fuzz) { + auto rowType = ROW({ + ROW({BIGINT(), VARCHAR(), DOUBLE()}), + MAP(VARCHAR(), ROW({ARRAY(BIGINT()), ARRAY(VARCHAR()), REAL()})), + ARRAY(ROW({BIGINT(), DOUBLE()})), + ARRAY(MAP(BIGINT(), DOUBLE())), + BIGINT(), + 
ARRAY(MAP(BIGINT(), VARCHAR())), + ARRAY(MAP(VARCHAR(), REAL())), + MAP(BIGINT(), ARRAY(BIGINT())), + BIGINT(), + ARRAY(BIGINT()), + DOUBLE(), + MAP(VARCHAR(), VARCHAR()), + VARCHAR(), + ARRAY(ARRAY(BIGINT())), + BIGINT(), + ARRAY(ARRAY(VARCHAR())), + }); + + VectorFuzzer::Options opts; + opts.vectorSize = 100; + opts.containerLength = 5; + opts.nullRatio = 0.1; + opts.containerHasNulls = true; + opts.dictionaryHasNulls = false; + opts.stringVariableLength = true; + opts.stringLength = 20; + opts.containerVariableLength = true; + opts.complexElementsMaxSize = 1'000; + + // Spark uses microseconds to store timestamp + opts.timestampPrecision = + VectorFuzzer::Options::TimestampPrecision::kMicroSeconds; + + VectorFuzzer fuzzer(opts, pool_.get()); + + const auto iterations = 200; + for (size_t i = 0; i < iterations; ++i) { + auto seed = folly::Random::rand32(); + + LOG(INFO) << i << ": seed: " << seed; + SCOPED_TRACE(fmt::format("seed: {}", seed)); + + fuzzer.reSeed(seed); + auto data = fuzzer.fuzzInputRow(rowType); + + testRoundTrip(data); + + if (Test::HasFailure()) { + break; + } + } +} + +} // namespace +} // namespace facebook::velox::row diff --git a/velox/serializers/CMakeLists.txt b/velox/serializers/CMakeLists.txt index 9fa18e048321..f9264fd3e57a 100644 --- a/velox/serializers/CMakeLists.txt +++ b/velox/serializers/CMakeLists.txt @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_presto_serializer PrestoSerializer.cpp - UnsafeRowSerializer.cpp) +add_library( + velox_presto_serializer PrestoSerializer.cpp UnsafeRowSerializer.cpp + CompactRowSerializer.cpp) -target_link_libraries(velox_presto_serializer velox_vector) +target_link_libraries(velox_presto_serializer velox_dwio_common velox_vector) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/serializers/CompactRowSerializer.cpp b/velox/serializers/CompactRowSerializer.cpp new file mode 100644 index 000000000000..eca7a60d8181 --- /dev/null +++ b/velox/serializers/CompactRowSerializer.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "velox/serializers/CompactRowSerializer.h" +#include +#include "velox/row/CompactRow.h" + +namespace facebook::velox::serializer { + +void CompactRowVectorSerde::estimateSerializedSize( + VectorPtr /* vector */, + const folly::Range& /* ranges */, + vector_size_t** /* sizes */) { + VELOX_UNSUPPORTED(); +} + +namespace { +class CompactRowVectorSerializer : public VectorSerializer { + public: + using TRowSize = uint32_t; + + explicit CompactRowVectorSerializer(StreamArena* streamArena) + : pool_{streamArena->pool()} {} + + void append( + const RowVectorPtr& vector, + const folly::Range& ranges) override { + size_t totalSize = 0; + row::CompactRow row(vector); + if (auto fixedRowSize = + row::CompactRow::fixedRowSize(asRowType(vector->type()))) { + for (const auto& range : ranges) { + totalSize += (fixedRowSize.value() + sizeof(TRowSize)) * range.size; + } + + } else { + for (const auto& range : ranges) { + for (auto i = range.begin; i < range.begin + range.size; ++i) { + totalSize += row.rowSize(i) + sizeof(TRowSize); + } + } + } + + if (totalSize == 0) { + return; + } + + BufferPtr buffer = AlignedBuffer::allocate(totalSize, pool_, 0); + auto rawBuffer = buffer->asMutable(); + buffers_.push_back(std::move(buffer)); + + size_t offset = 0; + for (auto& range : ranges) { + for (auto i = range.begin; i < range.begin + range.size; ++i) { + // Write row data. + TRowSize size = row.serialize(i, rawBuffer + offset + sizeof(TRowSize)); + + // Write raw size. Needs to be in big endian order. + *(TRowSize*)(rawBuffer + offset) = folly::Endian::big(size); + offset += sizeof(TRowSize) + size; + } + } + } + + void flush(OutputStream* stream) override { + for (const auto& buffer : buffers_) { + stream->write(buffer->as(), buffer->size()); + } + buffers_.clear(); + } + + private: + memory::MemoryPool* const FOLLY_NONNULL pool_; + std::vector buffers_; +}; +} // namespace + +std::unique_ptr CompactRowVectorSerde::createSerializer( + RowTypePtr /* type */, + int32_t /* numRows */, + StreamArena* streamArena, + const Options* /* options */) { + return std::make_unique(streamArena); +} + +void CompactRowVectorSerde::deserialize( + ByteStream* source, + velox::memory::MemoryPool* pool, + RowTypePtr type, + RowVectorPtr* result, + const Options* /* options */) { + std::vector serializedRows; + while (!source->atEnd()) { + // First read row size in big endian order. + auto rowSize = folly::Endian::big( + source->read()); + auto row = source->nextView(rowSize); + VELOX_CHECK_EQ(row.size(), rowSize); + serializedRows.push_back(row); + } + + if (serializedRows.empty()) { + *result = BaseVector::create(type, 0, pool); + return; + } + + *result = velox::row::CompactRow::deserialize(serializedRows, type, pool); +} + +// static +void CompactRowVectorSerde::registerVectorSerde() { + velox::registerVectorSerde(std::make_unique()); +} + +} // namespace facebook::velox::serializer diff --git a/velox/serializers/CompactRowSerializer.h b/velox/serializers/CompactRowSerializer.h new file mode 100644 index 000000000000..3ad0c99cbfa3 --- /dev/null +++ b/velox/serializers/CompactRowSerializer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
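The wire framing used by CompactRowVectorSerializer above is deliberately simple: each row is prefixed with its length as a big-endian uint32, which deserialize() reads back before taking a view over the row bytes. A standalone sketch of that framing with folly's Endian helpers:

```cpp
// Length-prefixed row framing: big-endian uint32 size, then the row bytes.
#include <folly/lang/Bits.h>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

std::string frameRow(const std::string& rowBytes) {
  uint32_t size = folly::Endian::big<uint32_t>(rowBytes.size());
  std::string framed(sizeof(size), '\0');
  std::memcpy(framed.data(), &size, sizeof(size));
  return framed + rowBytes;
}

std::string unframeRow(const std::string& framed) {
  uint32_t size;
  std::memcpy(&size, framed.data(), sizeof(size));
  size = folly::Endian::big(size); // big<->host conversion is symmetric
  return framed.substr(sizeof(size), size);
}

int main() {
  assert(unframeRow(frameRow("hello")) == "hello");
  return 0;
}
```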
diff --git a/velox/serializers/CompactRowSerializer.h b/velox/serializers/CompactRowSerializer.h
new file mode 100644
index 000000000000..3ad0c99cbfa3
--- /dev/null
+++ b/velox/serializers/CompactRowSerializer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/vector/ComplexVector.h"
+#include "velox/vector/VectorStream.h"
+
+namespace facebook::velox::serializer {
+
+class CompactRowVectorSerde : public VectorSerde {
+ public:
+  CompactRowVectorSerde() = default;
+
+  // We do not implement this method since it is not used in production code.
+  void estimateSerializedSize(
+      VectorPtr vector,
+      const folly::Range<const IndexRange*>& ranges,
+      vector_size_t** sizes) override;
+
+  // This method is not used in production code. It is only used to
+  // support round-trip tests for deserialization.
+  std::unique_ptr<VectorSerializer> createSerializer(
+      RowTypePtr type,
+      int32_t numRows,
+      StreamArena* streamArena,
+      const Options* options) override;
+
+  // This method is used when reading data from the exchange.
+  void deserialize(
+      ByteStream* source,
+      velox::memory::MemoryPool* pool,
+      RowTypePtr type,
+      RowVectorPtr* result,
+      const Options* options) override;
+
+  static void registerVectorSerde();
+};
+
+} // namespace facebook::velox::serializer
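Putting the two files together, a round trip through the new serde looks roughly like the following sketch. It mirrors the CompactRowSerializerTest added further down and assumes a leaf memory pool `pool` and an input `RowVectorPtr data`:

// Sketch only: serialize a RowVector with CompactRowVectorSerde.
using namespace facebook::velox;

serializer::CompactRowVectorSerde serde;
StreamArena arena(pool);
auto serializer = serde.createSerializer(
    asRowType(data->type()), data->size(), &arena, /*options=*/nullptr);

// Serialize all rows as a single contiguous range.
IndexRange allRows{0, data->size()};
serializer->append(data, folly::Range(&allRows, 1));

std::ostringstream out;
OStreamOutputStream outStream(&out);
serializer->flush(&outStream);
// out.str() now holds the big-endian length-prefixed frames that
// CompactRowVectorSerde::deserialize() reads back into a RowVector.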
diff --git a/velox/serializers/PrestoSerializer.cpp b/velox/serializers/PrestoSerializer.cpp
index 9c65f6d926db..904924d35f36 100644
--- a/velox/serializers/PrestoSerializer.cpp
+++ b/velox/serializers/PrestoSerializer.cpp
@@ -137,6 +137,18 @@ std::string typeToEncodingName(const TypePtr& type) {
   }
 }
 
+PrestoVectorSerde::PrestoOptions toPrestoOptions(
+    const VectorSerde::Options* options) {
+  if (options == nullptr) {
+    return PrestoVectorSerde::PrestoOptions();
+  }
+  return *(static_cast<const PrestoVectorSerde::PrestoOptions*>(options));
+}
+
+FOLLY_ALWAYS_INLINE bool needCompression(const folly::io::Codec& codec) {
+  return codec.type() != folly::io::CodecType::NO_COMPRESSION;
+}
+
 template <typename T>
 void readValues(
     ByteStream* source,
@@ -1558,7 +1570,10 @@ class PrestoVectorSerializer : public VectorSerializer {
       std::shared_ptr<const RowType> rowType,
       int32_t numRows,
       StreamArena* streamArena,
-      bool useLosslessTimestamp) {
+      bool useLosslessTimestamp,
+      common::CompressionKind compressionKind)
+      : streamArena_(streamArena),
+        codec_(common::compressionKindToCodec(compressionKind)) {
     auto types = rowType->children();
     auto numTypes = types.size();
     streams_.resize(numTypes);
@@ -1580,6 +1595,9 @@ class PrestoVectorSerializer : public VectorSerializer {
     }
   }
 
+  // The SerializedPage layout is:
+  // numRows(4) | codec(1) | uncompressedSize(4) | compressedSize(4) |
+  // checksum(8) | data
   void flush(OutputStream* out) override {
     flushInternal(numRows_, false /*rle*/, out);
   }
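Both flush paths introduced below emit this same fixed-size header; only the data section differs. As a plain struct, purely for illustration (the patch writes each field individually and never defines such a type):

// Illustrative layout of a serialized PrestoPage header.
struct PrestoPageHeader {
  int32_t numRows;
  int8_t codecMarker;       // kCheckSumBitMask and/or kCompressedBitMask.
  int32_t uncompressedSize; // Size of "column count + column streams".
  int32_t compressedSize;   // Both size fields match on the uncompressed path.
  int64_t checksum;         // Zero unless a PrestoOutputStreamListener is set.
};
// The data section follows: the column count and the column streams, wrapped
// in a single compressed block when the codec is not NO_COMPRESSION.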
@@ -1596,21 +1614,18 @@ class PrestoVectorSerializer : public VectorSerializer {
     flushInternal(vector->size(), true /*rle*/, out);
   }
 
-  // Writes the contents to 'stream' in wire format
-  void flushInternal(int32_t numRows, bool rle, OutputStream* out) {
-    auto listener = dynamic_cast<PrestoOutputStreamListener*>(out->listener());
-    // Reset CRC computation
-    if (listener) {
-      listener->reset();
-    }
+ private:
+  void flushUncompressed(
+      int32_t numRows,
+      bool rle,
+      OutputStream* out,
+      PrestoOutputStreamListener* listener) {
+    int32_t offset = out->tellp();
 
     char codec = 0;
     if (listener) {
       codec = getCodecMarker();
     }
-
-    int32_t offset = out->tellp();
-
     // Pause CRC computation
     if (listener) {
       listener->pause();
@@ -1622,7 +1637,8 @@ class PrestoVectorSerializer : public VectorSerializer {
     // Make space for uncompressedSizeInBytes & sizeInBytes
     writeInt32(out, 0);
     writeInt32(out, 0);
-    writeInt64(out, 0); // Write zero checksum
+    // Write zero checksum.
+    writeInt64(out, 0);
 
     // Number of columns and stream content. Unpause CRC.
     if (listener) {
@@ -1662,10 +1678,92 @@ class PrestoVectorSerializer : public VectorSerializer {
     out->seekp(offset + size);
   }
 
- private:
+  void flushCompressed(
+      int32_t numRows,
+      bool rle,
+      OutputStream* output,
+      PrestoOutputStreamListener* listener) {
+    const int32_t offset = output->tellp();
+    char codec = kCompressedBitMask;
+    if (listener) {
+      codec |= kCheckSumBitMask;
+    }
+
+    // Pause CRC computation
+    if (listener) {
+      listener->pause();
+    }
+
+    writeInt32(output, numRows);
+    output->write(&codec, 1);
+
+    IOBufOutputStream out(
+        *(streamArena_->pool()), nullptr, streamArena_->size());
+    writeInt32(&out, streams_.size());
+    if (rle) {
+      // Write RLE encoding marker.
+      writeInt32(&out, kRLE.size());
+      out.write(kRLE.data(), kRLE.size());
+      // Write number of RLE values.
+      writeInt32(&out, numRows);
+    }
+
+    for (auto& stream : streams_) {
+      stream->flush(&out);
+    }
+
+    const int32_t uncompressedSize = out.tellp();
+    VELOX_CHECK_LE(
+        uncompressedSize,
+        codec_->maxUncompressedLength(),
+        "UncompressedSize exceeds limit");
+    auto compressed = codec_->compress(out.getIOBuf().get());
+    const int32_t compressedSize = compressed->length();
+    writeInt32(output, uncompressedSize);
+    writeInt32(output, compressedSize);
+    const int32_t crcOffset = output->tellp();
+    writeInt64(output, 0); // Write zero checksum
+    // Number of columns and stream content. Unpause CRC.
+    if (listener) {
+      listener->resume();
+    }
+    output->write(
+        reinterpret_cast<char*>(compressed->writableData()),
+        compressed->length());
+    // Pause CRC computation
+    if (listener) {
+      listener->pause();
+    }
+
+    const int32_t endSize = output->tellp();
+    // Fill in crc
+    int64_t crc = 0;
+    if (listener) {
+      crc = computeChecksum(listener, codec, numRows, compressedSize);
+    }
+    output->seekp(crcOffset);
+    writeInt64(output, crc);
+    output->seekp(endSize);
+  }
+
+  // Writes the contents to 'stream' in wire format
+  void flushInternal(int32_t numRows, bool rle, OutputStream* out) {
+    auto listener = dynamic_cast<PrestoOutputStreamListener*>(out->listener());
+    // Reset CRC computation
+    if (listener) {
+      listener->reset();
+    }
+
+    if (!needCompression(*codec_)) {
+      flushUncompressed(numRows, rle, out, listener);
+    } else {
+      flushCompressed(numRows, rle, out, listener);
+    }
+  }
+
   static const int32_t kSizeInBytesOffset{4 + 1};
   static const int32_t kHeaderSize{kSizeInBytesOffset + 4 + 4 + 8};
 
+  StreamArena* const streamArena_;
+  const std::unique_ptr<folly::io::Codec> codec_;
+
   int32_t numRows_{0};
   std::vector<std::unique_ptr<VectorStream>> streams_;
 };
@@ -1683,11 +1781,13 @@ std::unique_ptr<VectorSerializer> PrestoVectorSerde::createSerializer(
     int32_t numRows,
     StreamArena* streamArena,
     const Options* options) {
-  bool useLosslessTimestamp = options != nullptr
-      ? static_cast<const PrestoOptions*>(options)->useLosslessTimestamp
-      : false;
+  auto prestoOptions = toPrestoOptions(options);
   return std::make_unique<PrestoVectorSerializer>(
-      type, numRows, streamArena, useLosslessTimestamp);
+      type,
+      numRows,
+      streamArena,
+      prestoOptions.useLosslessTimestamp,
+      prestoOptions.compressionKind);
 }
 
 void PrestoVectorSerde::serializeConstants(
@@ -1707,9 +1807,9 @@ void PrestoVectorSerde::deserialize(
     std::shared_ptr<const RowType> type,
     std::shared_ptr<RowVector>* result,
     const Options* options) {
-  bool useLosslessTimestamp = options != nullptr
-      ? static_cast<const PrestoOptions*>(options)->useLosslessTimestamp
-      : false;
+  auto prestoOptions = toPrestoOptions(options);
+  const bool useLosslessTimestamp = prestoOptions.useLosslessTimestamp;
+  auto codec = common::compressionKindToCodec(prestoOptions.compressionKind);
   auto numRows = source->read<int32_t>();
   if (!(*result) || !result->unique() || (*result)->type() != type) {
     *result = std::dynamic_pointer_cast<RowVector>(
@@ -1720,25 +1820,44 @@ void PrestoVectorSerde::deserialize(
 
   auto pageCodecMarker = source->read<int8_t>();
   auto uncompressedSize = source->read<int32_t>();
-  // skip size in bytes
-  source->skip(4);
+  auto compressedSize = source->read<int32_t>();
   auto checksum = source->read<int64_t>();
 
   int64_t actualCheckSum = 0;
   if (isChecksumBitSet(pageCodecMarker)) {
     actualCheckSum =
-        computeChecksum(source, pageCodecMarker, numRows, uncompressedSize);
+        computeChecksum(source, pageCodecMarker, numRows, compressedSize);
   }
 
   VELOX_CHECK_EQ(
       checksum, actualCheckSum, "Received corrupted serialized page.");
 
-  // skip number of columns
-  source->skip(4);
+  VELOX_CHECK_EQ(
+      needCompression(*codec),
+      isCompressedBitSet(pageCodecMarker),
+      "Compression kind {} should align with codec marker.",
+      common::compressionKindToString(
+          common::codecTypeToCompressionKind(codec->type())));
 
   auto children = &(*result)->children();
   auto childTypes = type->as<TypeKind::ROW>().children();
-  readColumns(source, pool, childTypes, children, useLosslessTimestamp);
+  if (!needCompression(*codec)) {
+    auto numColumns = source->read<int32_t>();
+    readColumns(source, pool, childTypes, children, useLosslessTimestamp);
+  } else {
+    auto compressBuf = folly::IOBuf::create(compressedSize);
+    source->readBytes(compressBuf->writableData(), compressedSize);
+    compressBuf->append(compressedSize);
+    auto uncompress = codec->uncompress(compressBuf.get(), uncompressedSize);
+    ByteRange byteRange{
+        uncompress->writableData(), (int32_t)uncompress->length(), 0};
+    ByteStream uncompressedSource;
+    uncompressedSource.resetInput({byteRange});
+    auto numColumns = uncompressedSource.read<int32_t>();
+    VELOX_CHECK_EQ(numColumns, type->as<TypeKind::ROW>().size());
+    readColumns(
+        &uncompressedSource, pool, childTypes, children, useLosslessTimestamp);
+  }
 }
 
 // static
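The serializer leans on folly codecs for the actual byte work; the essential compress/uncompress pairing used by flushCompressed() and deserialize() above, reduced to a sketch (buffer names are illustrative):

// Sketch only: the codec round trip the Presto page relies on.
auto codec = facebook::velox::common::compressionKindToCodec(
    facebook::velox::common::CompressionKind::CompressionKind_LZ4);

// Serialization: compress the flushed column streams in one shot.
std::unique_ptr<folly::IOBuf> compressed = codec->compress(streamsIOBuf.get());

// Deserialization: the uncompressedSize recorded in the page header is
// handed back to folly, which uses it to size and validate the output.
std::unique_ptr<folly::IOBuf> restored =
    codec->uncompress(compressed.get(), uncompressedSize);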
diff --git a/velox/serializers/PrestoSerializer.h b/velox/serializers/PrestoSerializer.h
index b9202ae51136..c0f6099c2a25 100644
--- a/velox/serializers/PrestoSerializer.h
+++ b/velox/serializers/PrestoSerializer.h
@@ -15,6 +15,7 @@
  */
 #pragma once
 #include "velox/common/base/Crc.h"
+#include "velox/common/compression/Compression.h"
 #include "velox/vector/VectorStream.h"
 
 namespace facebook::velox::serializer::presto {
@@ -22,13 +23,21 @@ class PrestoVectorSerde : public VectorSerde {
  public:
   // Input options that the serializer recognizes.
   struct PrestoOptions : VectorSerde::Options {
-    explicit PrestoOptions(bool useLosslessTimestamp)
-        : useLosslessTimestamp(useLosslessTimestamp) {}
+    PrestoOptions() = default;
+
+    PrestoOptions(
+        bool _useLosslessTimestamp,
+        common::CompressionKind _compressionKind)
+        : useLosslessTimestamp(_useLosslessTimestamp),
+          compressionKind(_compressionKind) {}
+
     // Currently presto only supports millisecond precision and the serializer
     // converts velox native timestamp to that resulting in loss of precision.
     // This option allows it to serialize with nanosecond precision and is
     // currently used for spilling. Is false by default.
     bool useLosslessTimestamp{false};
+
+    common::CompressionKind compressionKind{
+        common::CompressionKind::CompressionKind_NONE};
   };
 
   void estimateSerializedSize(
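From the caller's perspective, compression is opted into per serialize/deserialize call through these options; a sketch under the assumption that a serde, an arena, and the row type already exist:

// Sketch only: enable ZSTD compression for a Presto-page exchange.
using facebook::velox::common::CompressionKind;
using facebook::velox::serializer::presto::PrestoVectorSerde;

PrestoVectorSerde::PrestoOptions options(
    /*useLosslessTimestamp=*/false, CompressionKind::CompressionKind_ZSTD);

auto serializer = serde->createSerializer(rowType, numRows, &arena, &options);
// The reader must pass a matching compressionKind: deserialize() validates
// the page's codec marker against the codec derived from its options.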
diff --git a/velox/serializers/tests/CMakeLists.txt b/velox/serializers/tests/CMakeLists.txt
index 31e70511f74f..08cddb61f961 100644
--- a/velox/serializers/tests/CMakeLists.txt
+++ b/velox/serializers/tests/CMakeLists.txt
@@ -14,7 +14,7 @@
 add_executable(
   velox_presto_serializer_test
   PrestoOutputStreamListenerTest.cpp
   PrestoSerializerTest.cpp
-  UnsafeRowSerializerTest.cpp)
+  UnsafeRowSerializerTest.cpp CompactRowSerializerTest.cpp)
 
 add_test(velox_presto_serializer_test velox_presto_serializer_test)
diff --git a/velox/serializers/tests/CompactRowSerializerTest.cpp b/velox/serializers/tests/CompactRowSerializerTest.cpp
new file mode 100644
index 000000000000..31f21a95a7de
--- /dev/null
+++ b/velox/serializers/tests/CompactRowSerializerTest.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/serializers/CompactRowSerializer.h"
+#include <gtest/gtest.h>
+#include "velox/vector/fuzzer/VectorFuzzer.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+namespace facebook::velox::serializer {
+namespace {
+
+class CompactRowSerializerTest : public ::testing::Test,
+                                 public test::VectorTestBase {
+ protected:
+  void SetUp() override {
+    pool_ = memory::addDefaultLeafMemoryPool();
+    serde_ = std::make_unique<CompactRowVectorSerde>();
+  }
+
+  void serialize(RowVectorPtr rowVector, std::ostream* output) {
+    auto numRows = rowVector->size();
+
+    std::vector<IndexRange> rows(numRows);
+    for (int i = 0; i < numRows; i++) {
+      rows[i] = IndexRange{i, 1};
+    }
+
+    auto arena = std::make_unique<StreamArena>(pool_.get());
+    auto rowType = asRowType(rowVector->type());
+    auto serializer = serde_->createSerializer(rowType, numRows, arena.get());
+
+    serializer->append(rowVector, folly::Range(rows.data(), numRows));
+    OStreamOutputStream out(output);
+    serializer->flush(&out);
+  }
+
+  std::unique_ptr<ByteStream> toByteStream(const std::string_view& input) {
+    auto byteStream = std::make_unique<ByteStream>();
+    ByteRange byteRange{
+        reinterpret_cast<uint8_t*>(const_cast<char*>(input.data())),
+        (int32_t)input.length(),
+        0};
+    byteStream->resetInput({byteRange});
+    return byteStream;
+  }
+
+  RowVectorPtr deserialize(
+      const RowTypePtr& rowType,
+      const std::string_view& input) {
+    auto byteStream = toByteStream(input);
+
+    RowVectorPtr result;
+    serde_->deserialize(byteStream.get(), pool_.get(), rowType, &result);
+    return result;
+  }
+
+  void testRoundTrip(RowVectorPtr rowVector) {
+    std::ostringstream out;
+    serialize(rowVector, &out);
+
+    auto rowType = asRowType(rowVector->type());
+    auto deserialized = deserialize(rowType, out.str());
+    test::assertEqualVectors(deserialized, rowVector);
+  }
+
+  std::shared_ptr<memory::MemoryPool> pool_;
+  std::unique_ptr<VectorSerde> serde_;
+};
+
+TEST_F(CompactRowSerializerTest, fuzz) {
+  auto rowType = ROW({
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      INTEGER(),
+      BIGINT(),
+      REAL(),
+      DOUBLE(),
+      VARCHAR(),
+      TIMESTAMP(),
+      ROW({VARCHAR(), INTEGER()}),
+      ARRAY(INTEGER()),
+      ARRAY(INTEGER()),
+      MAP(VARCHAR(), INTEGER()),
+      MAP(VARCHAR(), ARRAY(INTEGER())),
+  });
+
+  VectorFuzzer::Options opts;
+  opts.vectorSize = 5;
+  opts.nullRatio = 0.1;
+  opts.containerHasNulls = false;
+  opts.dictionaryHasNulls = false;
+  opts.stringVariableLength = true;
+  opts.stringLength = 20;
+  opts.containerVariableLength = false;
+
+  // Spark uses microseconds to store timestamps.
+  opts.timestampPrecision =
+      VectorFuzzer::Options::TimestampPrecision::kMicroSeconds;
+  opts.containerLength = 10;
+
+  auto seed = folly::Random::rand32();
+
+  LOG(ERROR) << "Seed: " << seed;
+  SCOPED_TRACE(fmt::format("seed: {}", seed));
+  VectorFuzzer fuzzer(opts, pool_.get(), seed);
+
+  auto data = fuzzer.fuzzRow(rowType);
+  testRoundTrip(data);
+}
+
+} // namespace
+} // namespace facebook::velox::serializer
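Because this test draws a fresh seed on every run, reproducing a failure means pinning the seed printed in the log; a sketch of that change, with the constant standing in for the logged value:

// Sketch only: rerun the fuzz body deterministically.
const uint32_t kFailingSeed = 0; // Replace with the seed from the failing log.
VectorFuzzer fuzzer(opts, pool_.get(), kFailingSeed);
auto data = fuzzer.fuzzRow(rowType);
testRoundTrip(data);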
diff --git a/velox/serializers/tests/PrestoSerializerTest.cpp b/velox/serializers/tests/PrestoSerializerTest.cpp
index eedeb372b20b..450607ad85d6 100644
--- a/velox/serializers/tests/PrestoSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoSerializerTest.cpp
@@ -27,8 +27,13 @@
 using namespace facebook::velox;
 using namespace facebook::velox::test;
 
-class PrestoSerializerTest : public ::testing::Test {
+class PrestoSerializerTest
+    : public ::testing::TestWithParam<common::CompressionKind> {
  protected:
+  static void SetUpTestCase() {
+    serializer::presto::PrestoVectorSerde::registerVectorSerde();
+  }
+
   void SetUp() override {
     pool_ = memory::addDefaultLeafMemoryPool();
     serde_ = std::make_unique<serializer::presto::PrestoVectorSerde>();
@@ -52,17 +57,30 @@ class PrestoSerializerTest : public ::testing::Test {
         rowVector, folly::Range(rows.data(), numRows), rawRowSizes.data());
   }
 
+  serializer::presto::PrestoVectorSerde::PrestoOptions getParamSerdeOptions(
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
+    const bool useLosslessTimestamp =
+        serdeOptions == nullptr ? false : serdeOptions->useLosslessTimestamp;
+    common::CompressionKind kind = GetParam();
+    serializer::presto::PrestoVectorSerde::PrestoOptions paramOptions{
+        useLosslessTimestamp, kind};
+    return paramOptions;
+  }
+
   void serialize(
       const RowVectorPtr& rowVector,
       std::ostream* output,
-      const VectorSerde::Options* serdeOptions) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
     sanityCheckEstimateSerializedSize(rowVector);
 
     auto arena = std::make_unique<StreamArena>(pool_.get());
     auto rowType = asRowType(rowVector->type());
     auto numRows = rowVector->size();
+    auto paramOptions = getParamSerdeOptions(serdeOptions);
     auto serializer =
-        serde_->createSerializer(rowType, numRows, arena.get(), serdeOptions);
+        serde_->createSerializer(rowType, numRows, arena.get(), &paramOptions);
 
     serializer->append(rowVector);
     facebook::velox::serializer::presto::PrestoOutputStreamListener listener;
@@ -73,11 +91,13 @@ class PrestoSerializerTest : public ::testing::Test {
   void serializeRle(
       const RowVectorPtr& rowVector,
       std::ostream* output,
-      const VectorSerde::Options* serdeOptions) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
     facebook::velox::serializer::presto::PrestoOutputStreamListener listener;
     OStreamOutputStream out(output, &listener);
     auto arena = std::make_unique<StreamArena>(pool_.get());
-    serde_->serializeConstants(rowVector, arena.get(), serdeOptions, &out);
+    auto paramOptions = getParamSerdeOptions(serdeOptions);
+    serde_->serializeConstants(rowVector, arena.get(), &paramOptions, &out);
   }
 
   std::unique_ptr<ByteStream> toByteStream(const std::string& input) {
@@ -93,12 +113,13 @@ class PrestoSerializerTest : public ::testing::Test {
   RowVectorPtr deserialize(
       const RowTypePtr& rowType,
       const std::string& input,
-      const VectorSerde::Options* serdeOptions) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
     auto byteStream = toByteStream(input);
-
+    auto paramOptions = getParamSerdeOptions(serdeOptions);
     RowVectorPtr result;
     serde_->deserialize(
-        byteStream.get(), pool_.get(), rowType, &result, serdeOptions);
+        byteStream.get(), pool_.get(), rowType, &result, &paramOptions);
     return result;
   }
 
@@ -115,7 +136,8 @@ class PrestoSerializerTest : public ::testing::Test {
 
   void testRoundTrip(
       VectorPtr vector,
-      const VectorSerde::Options* serdeOptions = nullptr) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions* serdeOptions =
+          nullptr) {
     auto rowVector = vectorMaker_->rowVector({vector});
     std::ostringstream out;
     serialize(rowVector, &out, serdeOptions);
@@ -125,13 +147,16 @@ class PrestoSerializerTest : public ::testing::Test {
     assertEqualVectors(deserialized, rowVector);
   }
 
-  void testRleRoundTrip(const VectorPtr& constantVector) {
+  void testRleRoundTrip(
+      const VectorPtr& constantVector,
+      const serializer::presto::PrestoVectorSerde::PrestoOptions* serdeOptions =
+          nullptr) {
     auto rowVector = vectorMaker_->rowVector({constantVector});
     std::ostringstream out;
-    serializeRle(rowVector, &out, nullptr);
+    serializeRle(rowVector, &out, serdeOptions);
 
     auto rowType = asRowType(rowVector->type());
-    auto deserialized = deserialize(rowType, out.str(), nullptr);
+    auto deserialized = deserialize(rowType, out.str(), serdeOptions);
 
     assertEqualVectors(rowVector, deserialized);
   }
 
@@ -141,7 +166,7 @@ class PrestoSerializerTest : public ::testing::Test {
   std::unique_ptr<test::VectorMaker> vectorMaker_;
 };
 
-TEST_F(PrestoSerializerTest, basic) {
+TEST_P(PrestoSerializerTest, basic) {
   vector_size_t numRows = 1'000;
   auto rowVector = makeTestVector(numRows);
   testRoundTrip(rowVector);
@@ -149,7 +174,7 @@ TEST_F(PrestoSerializerTest, basic) {
 
 /// Test serialization of a dictionary vector that adds nulls to the base
 /// vector.
-TEST_F(PrestoSerializerTest, dictionaryWithExtraNulls) {
+TEST_P(PrestoSerializerTest, dictionaryWithExtraNulls) {
   vector_size_t size = 1'000;
 
   auto base =
@@ -173,7 +198,7 @@ TEST_F(PrestoSerializerTest, dictionaryWithExtraNulls) {
   testRoundTrip(dictionary);
 }
 
-TEST_F(PrestoSerializerTest, emptyPage) {
+TEST_P(PrestoSerializerTest, emptyPage) {
   auto rowVector = vectorMaker_->rowVector(ROW({"a"}, {BIGINT()}), 0);
 
   std::ostringstream out;
@@ -184,7 +209,7 @@ TEST_F(PrestoSerializerTest, emptyPage) {
   assertEqualVectors(deserialized, rowVector);
 }
 
-TEST_F(PrestoSerializerTest, emptyArray) {
+TEST_P(PrestoSerializerTest, emptyArray) {
   auto arrayVector = vectorMaker_->arrayVector<int32_t>(
       1'000,
       [](vector_size_t row) { return row % 5; },
@@ -193,7 +218,7 @@ TEST_F(PrestoSerializerTest, emptyArray) {
   testRoundTrip(arrayVector);
 }
 
-TEST_F(PrestoSerializerTest, emptyMap) {
+TEST_P(PrestoSerializerTest, emptyMap) {
   auto mapVector = vectorMaker_->mapVector<int32_t, int32_t>(
       1'000,
       [](vector_size_t row) { return row % 5; },
@@ -203,7 +228,7 @@ TEST_F(PrestoSerializerTest, emptyMap) {
   testRoundTrip(mapVector);
 }
 
-TEST_F(PrestoSerializerTest, timestampWithTimeZone) {
+TEST_P(PrestoSerializerTest, timestampWithTimeZone) {
   auto timestamp = vectorMaker_->flatVector<int64_t>(
       100, [](auto row) { return 10'000 + row; });
   auto timezone =
@@ -225,7 +250,7 @@ TEST_F(PrestoSerializerTest, timestampWithTimeZone) {
   testRoundTrip(vector);
 }
 
-TEST_F(PrestoSerializerTest, intervalDayTime) {
+TEST_P(PrestoSerializerTest, intervalDayTime) {
  auto vector = vectorMaker_->flatVector<int64_t>(
       100,
       [](auto row) { return row + folly::Random::rand32(); },
@@ -241,7 +266,7 @@ TEST_F(PrestoSerializerTest, intervalDayTime) {
   testRoundTrip(vector);
 }
 
-TEST_F(PrestoSerializerTest, unknown) {
+TEST_P(PrestoSerializerTest, unknown) {
   const vector_size_t size = 123;
   auto constantVector =
       BaseVector::createNullConstant(UNKNOWN(), 123, pool_.get());
@@ -254,7 +279,7 @@ TEST_F(PrestoSerializerTest, unknown) {
   testRoundTrip(flatVector);
 }
 
-TEST_F(PrestoSerializerTest, multiPage) {
+TEST_P(PrestoSerializerTest, multiPage) {
   std::ostringstream out;
 
   // page 1
@@ -275,27 +300,29 @@ TEST_F(PrestoSerializerTest, multiPage) {
   auto byteStream = toByteStream(bytes);
 
   RowVectorPtr deserialized;
+  auto paramOptions = getParamSerdeOptions(nullptr);
   serde_->deserialize(
-      byteStream.get(), pool_.get(), rowType, &deserialized, nullptr);
+      byteStream.get(), pool_.get(), rowType, &deserialized, &paramOptions);
   ASSERT_FALSE(byteStream->atEnd());
   assertEqualVectors(deserialized, a);
 
   serde_->deserialize(
-      byteStream.get(), pool_.get(), rowType, &deserialized, nullptr);
+      byteStream.get(), pool_.get(), rowType, &deserialized, &paramOptions);
   assertEqualVectors(deserialized, b);
   ASSERT_FALSE(byteStream->atEnd());
 
   serde_->deserialize(
-      byteStream.get(), pool_.get(), rowType, &deserialized, nullptr);
+      byteStream.get(), pool_.get(), rowType, &deserialized, &paramOptions);
   assertEqualVectors(deserialized, c);
   ASSERT_TRUE(byteStream->atEnd());
 }
 
-TEST_F(PrestoSerializerTest, timestampWithNanosecondPrecision) {
+TEST_P(PrestoSerializerTest, timestampWithNanosecondPrecision) {
   // Verify that nanosecond precision is preserved when the right options are
   // passed to the serde.
   const serializer::presto::PrestoVectorSerde::PrestoOptions
-      kUseLosslessTimestampOptions(true);
+      kUseLosslessTimestampOptions(
+          true, common::CompressionKind::CompressionKind_NONE);
   auto timestamp = vectorMaker_->flatVector<Timestamp>(
       {Timestamp{0, 0},
        Timestamp{12, 0},
@@ -321,7 +348,7 @@ TEST_F(PrestoSerializerTest, timestampWithNanosecondPrecision) {
   assertEqualVectors(deserialized, expectedOutputWithLostPrecision);
 }
 
-TEST_F(PrestoSerializerTest, longDecimal) {
+TEST_P(PrestoSerializerTest, longDecimal) {
   std::vector<int128_t> decimalValues(102);
   decimalValues[0] = DecimalUtil::kLongDecimalMin;
   for (int row = 1; row < 101; row++) {
@@ -340,7 +367,7 @@ TEST_F(PrestoSerializerTest, longDecimal) {
   testRoundTrip(vector);
 }
 
-TEST_F(PrestoSerializerTest, rle) {
+TEST_P(PrestoSerializerTest, rle) {
   // Test RLE vectors with non-null value.
   testRleRoundTrip(
       BaseVector::createConstant(BOOLEAN(), true, 12, pool_.get()));
@@ -369,7 +396,7 @@ TEST_F(PrestoSerializerTest, rle) {
       MAP(VARCHAR(), INTEGER()), 17, pool_.get()));
 }
 
-TEST_F(PrestoSerializerTest, lazy) {
+TEST_P(PrestoSerializerTest, lazy) {
   constexpr int kSize = 1000;
   auto rowVector = makeTestVector(kSize);
   auto lazyVector = std::make_shared<LazyVector>(
@@ -380,9 +407,7 @@ TEST_F(PrestoSerializerTest, lazy) {
   testRoundTrip(lazyVector);
 }
 
-TEST_F(PrestoSerializerTest, ioBufRoundTrip) {
-  serializer::presto::PrestoVectorSerde::registerVectorSerde();
-
+TEST_P(PrestoSerializerTest, ioBufRoundTrip) {
   VectorFuzzer::Options opts;
   opts.timestampPrecision =
       VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
@@ -400,3 +425,13 @@ TEST_F(PrestoSerializerTest, ioBufRoundTrip) {
   assertEqualVectors(inputRowVector, outputRowVector);
 }
+
+INSTANTIATE_TEST_SUITE_P(
+    PrestoSerializerTest,
+    PrestoSerializerTest,
+    ::testing::Values(
+        common::CompressionKind::CompressionKind_NONE,
+        common::CompressionKind::CompressionKind_ZLIB,
+        common::CompressionKind::CompressionKind_SNAPPY,
+        common::CompressionKind::CompressionKind_ZSTD,
+        common::CompressionKind::CompressionKind_LZ4));
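With this instantiation, every TEST_P body above runs once per listed CompressionKind, with GetParam() feeding getParamSerdeOptions(). Additional codecs can be covered by a separate instantiation; a hypothetical example (GZIP is a CompressionKind in Velox, but this patch does not exercise it):

// Hypothetical: a separate instantiation exercising GZIP only.
INSTANTIATE_TEST_SUITE_P(
    PrestoSerializerGzipTest,
    PrestoSerializerTest,
    ::testing::Values(common::CompressionKind::CompressionKind_GZIP));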