diff --git a/.circleci/config.yml b/.circleci/config.yml index 2391d810f88d..e1b67e1502f2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,6 +19,7 @@ workflows: dist-compile: jobs: - linux-build + - linux-pr-fuzzer-run - linux-build-options - linux-adapters - macos-build: @@ -161,7 +162,7 @@ commands: - run: name: "Run << parameters.fuzzer_name >> Fuzzer" command: | - eval " << parameters.fuzzer_exe >> << parameters.fuzzer_args >> " \ + eval ' << parameters.fuzzer_exe >> << parameters.fuzzer_args >> ' \ 2>&1 | tee "<< parameters.fuzzer_output >>" || ( \ tail -n 1000 "<< parameters.fuzzer_output >>" ; \ echo "FAIL: << parameters.fuzzer_name >> run failed"; \ @@ -357,34 +358,6 @@ jobs: name: "Run Example Binaries" command: | find _build/debug/velox/examples/ -maxdepth 1 -type f -executable -exec "{}" \; - - run: - name: "Build and Test PyVelox" - command: | - conda init bash - source ~/.bashrc - conda create -y --name pyveloxenv python=3.7 - conda activate pyveloxenv - LD_LIBRARY_PATH=/usr/local/lib make python-test - - run: - name: "Check function signatures" - command: | - source ~/.bashrc - conda activate pyveloxenv - pip install deepdiff - python ./scripts/signature.py export --spark --presto /tmp/pr_signatures.json - cp ./scripts/signature.py /tmp/signature.py - git remote add upstream https://github.com/facebookincubator/velox - git fetch upstream - merge_base=$(git merge-base 'upstream/main' `git rev-parse HEAD`) || \ - { echo "::error::Failed to find merge_base"; exit 1; } - echo "Merge Base: $merge_base" - git checkout $merge_base - git submodule update --init --recursive - LD_LIBRARY_PATH=/usr/local/lib make python-clean - LD_LIBRARY_PATH=/usr/local/lib make python-build - cp /tmp/signature.py ./scripts/signature.py - python ./scripts/signature.py export --spark --presto /tmp/main_signatures.json - python ./scripts/signature.py diff /tmp/main_signatures.json /tmp/pr_signatures.json - post-steps linux-build-release: @@ -671,3 +644,78 @@ jobs: git config --global user.name "velox" cd presto/presto-native-execution make runtime-container + + linux-pr-fuzzer-run: + executor: build + steps: + - pre-steps + - run: + name: "Get merge base function signatures" + command: | + source ~/.bashrc + conda create -y --name pyveloxenv python=3.7 + conda activate pyveloxenv + cp ./scripts/signature.py /tmp/signature.py + pip install deepdiff + git remote add upstream https://github.com/facebookincubator/velox + git fetch upstream + merge_base=$(git merge-base 'upstream/main' `git rev-parse HEAD`) || \ + { echo "::error::Failed to find merge_base"; exit 1; } + echo "Merge Base: $merge_base" + git checkout $merge_base + git submodule update --init --recursive + LD_LIBRARY_PATH=/usr/local/lib make python-clean + LD_LIBRARY_PATH=/usr/local/lib make python-build + python /tmp/signature.py export --spark spark_merge_base_signatures.json + python /tmp/signature.py export --presto presto_merge_base_signatures.json + - checkout + - run: + name: "Build" + command: | + make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=6 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" + ccache -s + no_output_timeout: 1h + - run: + name: "Build and test PyVelox" + command: | + conda init bash + source ~/.bashrc + conda activate pyveloxenv + LD_LIBRARY_PATH=/usr/local/lib make python-test + - run: + name: "Check and create bias function signatures" + command: | + source ~/.bashrc + conda activate pyveloxenv + pip install deepdiff + python ./scripts/signature.py export --presto presto_pr_signatures.json + 
python ./scripts/signature.py export --spark spark_pr_signatures.json + python ./scripts/signature.py bias presto_merge_base_signatures.json presto_pr_signatures.json /tmp/presto_bias_functions + python ./scripts/signature.py bias spark_merge_base_signatures.json spark_pr_signatures.json /tmp/spark_bias_functions + + - fuzzer-run: + fuzzer_output: "/tmp/fuzzer.log" + fuzzer_repro: "/tmp/fuzzer_repro" + fuzzer_name: "Expression Bias Run" + fuzzer_exe: "if [ -f /tmp/presto_bias_functions ]; then _build/debug/velox/expression/tests/velox_expression_fuzzer_test" + fuzzer_args: " --seed ${RANDOM} --lazy_vector_generation_ratio 0.2 \ + --assign_function_tickets $(cat /tmp/presto_bias_functions) \ + --duration_sec 3600 --enable_variadic_signatures \ + --velox_fuzzer_enable_complex_types \ + --velox_fuzzer_enable_column_reuse \ + --velox_fuzzer_enable_expression_reuse \ + --max_expression_trees_per_step 2 \ + --retry_with_try \ + --enable_dereference \ + --logtostderr=1 --minloglevel=0 \ + --repro_persist_path=/tmp/fuzzer_repro ; fi" + + - fuzzer-run: + fuzzer_output: "/tmp/spark_fuzzer.log" + fuzzer_repro: "/tmp/spark_fuzzer_repro" + fuzzer_name: "Spark Bias Run" + fuzzer_exe: "if [ -f /tmp/spark_bias_functions ]; then _build/debug/velox/expression/tests/spark_expression_fuzzer_test" + fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0 \ + --assign_function_tickets $(cat /tmp/spark_bias_functions) \ + --repro_persist_path=/tmp/spark_fuzzer_repro ; fi" + diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c1ddd372baef..6a140b12c1b3 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -193,8 +193,7 @@ jobs: ./scripts/benchmark-runner.py compare \ --baseline_path ${BASELINE_OUTPUT_PATH} \ --contender_path ${CONTENDER_OUTPUT_PATH} \ - --recursive \ - --do_not_fail + --recursive echo "::endgroup::" - name: "Save PR number" diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index fc9daf778045..8090f8779ccc 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit dfeb9e3b20b41ba776d2789e035c1b36c96faa75 +Subproject commit d0254f0af28be32985a43159c3dd8156892f140c diff --git a/build/fbcode_builder/CMake/FBPythonBinary.cmake b/build/fbcode_builder/CMake/FBPythonBinary.cmake index 99c33fb8c953..f91ebaf32645 100644 --- a/build/fbcode_builder/CMake/FBPythonBinary.cmake +++ b/build/fbcode_builder/CMake/FBPythonBinary.cmake @@ -32,7 +32,7 @@ if(NOT TARGET Python3::Interpreter) # We find with QUIET here, since otherwise this generates some noisy warnings # on versions of CMake before 3.12 if (WIN32) - # On Windows we need both the Intepreter as well as the Development + # On Windows we need both the Interpreter as well as the Development # libraries. find_package(Python3 COMPONENTS Interpreter Development QUIET) else() @@ -487,7 +487,7 @@ function(add_fb_python_library LIB_NAME) # won't complain if one of the dependencies doesn't exist (since it is # intended to allow passing in file names for plain library files rather # than just targets). - # - It ensures that sources for our depencencies are built before any + # - It ensures that sources for our dependencies are built before any # executable that depends on us. 
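A condensed sketch of the bias flow the job above wires together, using the same commands and paths that appear in the config:

```sh
# Export signatures for the PR head, diff them against the merge base, and
# write the changed function names (with ticket counts) to the bias file.
python ./scripts/signature.py export --presto presto_pr_signatures.json
python ./scripts/signature.py bias presto_merge_base_signatures.json \
    presto_pr_signatures.json /tmp/presto_bias_functions
# The fuzzer then weights those functions via --assign_function_tickets.
_build/debug/velox/expression/tests/velox_expression_fuzzer_test \
    --assign_function_tickets "$(cat /tmp/presto_bias_functions)" \
    --duration_sec 3600
```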
Note that we depend on "${dep}.py_lib" # rather than "${dep}.py_sources_built" for this purpose because the # ".py_sources_built" target won't be available for imported targets. diff --git a/build/fbcode_builder/CMake/fb_py_test_main.py b/build/fbcode_builder/CMake/fb_py_test_main.py index e9ae5dd028a6..41626181b1ec 100644 --- a/build/fbcode_builder/CMake/fb_py_test_main.py +++ b/build/fbcode_builder/CMake/fb_py_test_main.py @@ -262,7 +262,7 @@ def stopTest(self, test): super(BuckTestResult, self).stopTest(test) - # If a failure occured during module/class setup, then this "test" may + # If a failure occurred during module/class setup, then this "test" may # actually be a `_ErrorHolder`, which doesn't contain explicit info # about the upcoming test. Since we really only care about the test # name field (i.e. `_testMethodName`), we use that to detect an actual diff --git a/build/fbcode_builder/getdeps.py b/build/fbcode_builder/getdeps.py index 565ef99135e7..9358c425e4aa 100755 --- a/build/fbcode_builder/getdeps.py +++ b/build/fbcode_builder/getdeps.py @@ -626,7 +626,7 @@ def run_project_cmd(self, args, loader, manifest): ) builder.build(install_dirs, reconfigure=reconfigure) - # If we are building the project (not depdendency) and a specific + # If we are building the project (not dependency) and a specific # cmake_target (not 'install') has been requested, then we don't # set the built_marker. This allows subsequent runs of getdeps.py # for the project to run with different cmake_targets to trigger diff --git a/build/fbcode_builder/getdeps/builder.py b/build/fbcode_builder/getdeps/builder.py index 4f0c809092f2..aa1b0f99601c 100644 --- a/build/fbcode_builder/getdeps/builder.py +++ b/build/fbcode_builder/getdeps/builder.py @@ -346,7 +346,7 @@ def _build(self, install_dirs, reconfigure) -> None: class Iproute2Builder(BuilderBase): # ./configure --prefix does not work for iproute2. - # Thus, explicitly copy sources from src_dir to build_dir, bulid, + # Thus, explicitly copy sources from src_dir to build_dir, build, # and then install to inst_dir using DESTDIR # lastly, also copy include from build_dir to inst_dir def __init__(self, build_opts, ctx, manifest, src_dir, build_dir, inst_dir) -> None: diff --git a/build/fbcode_builder/getdeps/cargo.py b/build/fbcode_builder/getdeps/cargo.py index 64a4e577b33e..09e00a39cf98 100644 --- a/build/fbcode_builder/getdeps/cargo.py +++ b/build/fbcode_builder/getdeps/cargo.py @@ -194,7 +194,7 @@ def _patchup_workspace(self, dep_to_git) -> None: my-rename-of-crate = { package = "crate", git = "..." } they can count themselves lucky because the code will raise an - Exception. There migh be more cases where the code will silently pass + Exception. There might be more cases where the code will silently pass producing bad results. """ workspace_dir = self.workspace_dir() @@ -362,7 +362,7 @@ def _resolve_dep_to_crates(self, build_source_dir, dep_to_git): dep_to_crates = {} - # First populate explicit crate paths from depedencies + # First populate explicit crate paths from dependencies for name, git_conf in dep_to_git.items(): crates = git_conf["crate_source_map"].keys() if crates: diff --git a/build/fbcode_builder/getdeps/envfuncs.py b/build/fbcode_builder/getdeps/envfuncs.py index 6072a69ec4db..60de6b23143e 100644 --- a/build/fbcode_builder/getdeps/envfuncs.py +++ b/build/fbcode_builder/getdeps/envfuncs.py @@ -32,7 +32,7 @@ def _key(self, key): # project uses `unicode_literals`. 
`subprocess` will raise an error # if the environment that it is passed has a mixture of byte and # unicode strings. - # It is simplest to force everthing to be `str` for the sake of + # It is simplest to force everything to be `str` for the sake of # consistency. key = str(key) if sys.platform.startswith("win"): diff --git a/build/fbcode_builder/manifests/lz4 b/build/fbcode_builder/manifests/lz4 index 2ce1ca9fd1ec..084d6a4aecd8 100644 --- a/build/fbcode_builder/manifests/lz4 +++ b/build/fbcode_builder/manifests/lz4 @@ -6,8 +6,8 @@ lz4 [rpms] lz4-devel -# centos (not centos_stream that is Meta internal) 8 is missing this -[rpms.not(all(distro=centos,distro_vers=8))] +# centos 8 and centos_stream 9 are missing this rpm +[rpms.not(any(all(distro=centos,distro_vers=8),all(distro=centos_stream,distro_vers=9)))] lz4-static [debs] diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh index a5b1ccf794fb..83d8990603ea 100755 --- a/scripts/setup-macos.sh +++ b/scripts/setup-macos.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This script documents setting up a macOS host for presto_cpp +# This script documents setting up a macOS host for Velox # development. Running it should make you ready to compile. # # Environment variables: diff --git a/scripts/signature.py b/scripts/signature.py index 32d2bf361233..95a51b11371b 100644 --- a/scripts/signature.py +++ b/scripts/signature.py @@ -34,10 +34,10 @@ class bcolors: def export(args): """Exports Velox function signatures.""" if args.spark: - pv.register_spark_signatures("spark_") + pv.register_spark_signatures() if args.presto: - pv.register_presto_signatures("presto_") + pv.register_presto_signatures() signatures = pv.get_function_signatures() @@ -51,12 +51,15 @@ def export(args): return 0 -def diff(args): - """Diffs Velox function signatures.""" - first_signatures = json.load(args.first) - second_signatures = json.load(args.second) +def diff_signatures(base_signatures, contender_signatures): + """Diffs Velox function signatures. Returns a tuple of the delta diff and exit status.""" + delta = DeepDiff( - first_signatures, second_signatures, ignore_order=True, report_repetition=True + base_signatures, + contender_signatures, + ignore_order=True, + report_repetition=True, + view="tree", ) exit_status = 0 if delta: @@ -93,10 +96,69 @@ def diff(args): """ ) - return exit_status + return delta, exit_status + + +def diff(args): + """Diffs Velox function signatures.""" + base_signatures = json.load(args.base) + contender_signatures = json.load(args.contender) + return diff_signatures(base_signatures, contender_signatures)[1] + + +def bias(args): + base_signatures = json.load(args.base) + contender_signatures = json.load(args.contender) + tickets = args.ticket_value + bias_output, status = bias_signatures( + base_signatures, contender_signatures, tickets + ) + if status: + return status + + if bias_output: + with open(args.output_path, "w") as f: + print(f"{bias_output}", file=f, end="") + + return 0 -def parse_args(): +def bias_signatures(base_signatures, contender_signatures, tickets): + """Returns newly added functions as a string and a status flag. + Newly added functions are biased like so `fn_name1=<tickets>,fn_name2=<tickets>`. + If it detects incompatible changes, it returns status 1 and an empty string. + """ + delta, status = diff_signatures(base_signatures, contender_signatures) + + # Return if the signature check call flags incompatible changes.
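+    # (With view="tree", each reported change knows its path in the input
+    # dict, and item.get_root_key() below recovers the top-level function
+    # name it belongs to. E.g. adding "foo" to {"reverse": [...]} yields an
+    # added item whose get_root_key() == "foo".)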
+ if status: + return "", status + + if not delta: + print(f"{bcolors.BOLD} No changes detected: Nothing to do!") + return "", 0 + + function_set = set() + for items in delta.values(): + for item in items: + function_set.add(item.get_root_key()) + + print(f"{bcolors.BOLD}Functions to be biased: {function_set}") + + if function_set: + return f"{f'={tickets},'.join(sorted(function_set)) + f'={tickets}'}", 0 + + return "", 0 + + +def get_tickets(val): + tickets = int(val) + if tickets < 0: + raise argparse.ArgumentTypeError("Can't have negative values!") + return tickets + + +def parse_args(args): global parser parser = argparse.ArgumentParser( @@ -111,16 +173,23 @@ def parse_args(): export_command_parser.add_argument("output_file", type=argparse.FileType("w")) diff_command_parser = command.add_parser("diff") - diff_command_parser.add_argument("first", type=argparse.FileType("r")) - diff_command_parser.add_argument("second", type=argparse.FileType("r")) - + diff_command_parser.add_argument("base", type=argparse.FileType("r")) + diff_command_parser.add_argument("contender", type=argparse.FileType("r")) + + bias_command_parser = command.add_parser("bias") + bias_command_parser.add_argument("base", type=argparse.FileType("r")) + bias_command_parser.add_argument("contender", type=argparse.FileType("r")) + bias_command_parser.add_argument("output_path") + bias_command_parser.add_argument( + "ticket_value", type=get_tickets, default=10, nargs="?" + ) parser.set_defaults(command="help") - return parser.parse_args() + return parser.parse_args(args) def main(): - args = parse_args() + args = parse_args(sys.argv[1:]) return globals()[args.command](args) diff --git a/scripts/tests/test_signature.py b/scripts/tests/test_signature.py new file mode 100644 index 000000000000..a32c1121c0b7 --- /dev/null +++ b/scripts/tests/test_signature.py @@ -0,0 +1,70 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from scripts.signature import bias_signatures +from pathlib import Path +import json + + +def read_from_file(file_path): + return Path(file_path).read_text() + + +def test_bias(base_signatures, contender_signatures): + return bias_signatures( + json.loads(base_signatures), json.loads(contender_signatures), 10 + ) + + +class SignatureTest(unittest.TestCase): + def test_bias(self): + # Remove a signature + _, return_value = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": []}""", + ) + + self.assertEqual(return_value, 1) + + # Add a new signature + bias_functions, _ = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)"], + "foo": ["(varchar) -> varchar"]}""", + ) + + self.assertEqual(bias_functions, "foo=10") + + # Modify a signature.
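+        # (Format reminder: bias_signatures() joins the sorted changed names
+        # as "<name>=<tickets>", so changing {"foo", "bar"} with 10 tickets
+        # yields "bar=10,foo=10", as the multi-change case below asserts.)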
+ bias_functions, _ = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)", "(varchar) -> varchar"]}""", + ) + + self.assertEqual(bias_functions, "reverse=10") + + # Add more than one signature change + bias_functions, _ = test_bias( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)"], + "foo": ["(varchar) -> varchar"], + "bar": ["(varchar) -> varchar"]}""", + ) + + self.assertEqual(bias_functions, "bar=10,foo=10") + + +if __name__ == "__main__": + unittest.main() diff --git a/velox/benchmarks/basic/CastBenchmark.cpp b/velox/benchmarks/basic/CastBenchmark.cpp index 6908f12d1eef..356b2a704e57 100644 --- a/velox/benchmarks/basic/CastBenchmark.cpp +++ b/velox/benchmarks/basic/CastBenchmark.cpp @@ -48,10 +48,14 @@ int main(int argc, char** argv) { "cast_int", vectorMaker.rowVector( {"valid", "empty", "nan"}, {validInput, invalidInput, nanInput})) - .addExpression("try_invalid_empty_input", "try_cast (empty as int)") - .addExpression("try_invalid_nan", "try_cast (nan as int)") - .addExpression("try_valid", "try_cast (valid as int)") - .addExpression("valid", "cast(valid as int)") + .addExpression("try_cast_invalid_empty_input", "try_cast (empty as int) ") + .addExpression( + "tryexpr_cast_invalid_empty_input", "try (cast (empty as int))") + .addExpression("try_cast_invalid_nan", "try_cast (nan as int)") + .addExpression("tryexpr_cast_invalid_nan", "try (cast (nan as int))") + .addExpression("try_cast_valid", "try_cast (valid as int)") + .addExpression("tryexpr_cast_valid", "try (cast (valid as int))") + .addExpression("cast_valid", "cast(valid as int)") .withIterations(100) .disableTesting(); diff --git a/velox/benchmarks/tpch/TpchBenchmark.cpp b/velox/benchmarks/tpch/TpchBenchmark.cpp index 53ba99feb6b6..8890a50a8e1b 100644 --- a/velox/benchmarks/tpch/TpchBenchmark.cpp +++ b/velox/benchmarks/tpch/TpchBenchmark.cpp @@ -230,9 +230,10 @@ class TpchBenchmark { static_cast(FLAGS_ssd_checkpoint_interval_gb) << 30); } - auto allocator = std::make_shared(options); - allocator_ = std::make_shared( - allocator, memoryBytes, std::move(ssdCache)); + allocator_ = std::make_shared(options); + cache_ = + cache::AsyncDataCache::create(allocator_.get(), std::move(ssdCache)); + cache::AsyncDataCache::setInstance(cache_.get()); memory::MemoryAllocator::setDefaultInstance(allocator_.get()); } functions::prestosql::registerAllScalarFunctions(); @@ -261,6 +262,10 @@ class TpchBenchmark { connector::registerConnector(hiveConnector); } + void shutdown() { + cache_->prepareShutdown(); + } + std::pair, std::vector> run( const TpchPlan& tpchPlan) { int32_t repeat = 0; @@ -382,15 +387,13 @@ class TpchBenchmark { } #endif - auto cache = dynamic_cast(allocator_.get()); - if (cache) { - cache->clear(); + if (cache_) { + cache_->clear(); } } if (FLAGS_clear_ssd_cache) { - auto cache = dynamic_cast(allocator_.get()); - if (cache) { - auto ssdCache = cache->ssdCache(); + if (cache_) { + auto ssdCache = cache_->ssdCache(); if (ssdCache) { ssdCache->clear(); } @@ -462,7 +465,7 @@ class TpchBenchmark { std::unique_ptr ioExecutor_; std::unique_ptr cacheExecutor_; std::shared_ptr allocator_; - + std::shared_ptr cache_; // Parameter combinations to try. Each element specifies a flag and possible // values. All permutations are tried. 
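The TpchBenchmark changes above amount to the following cache lifecycle, shown as a minimal sketch (assumes 'options' and 'ssdCache' are prepared as in the diff):

```cpp
// Build the allocator, wrap it in an AsyncDataCache, and publish both.
auto allocator = std::make_shared<memory::MmapAllocator>(options);
auto cache = cache::AsyncDataCache::create(allocator.get(), std::move(ssdCache));
cache::AsyncDataCache::setInstance(cache.get());
memory::MemoryAllocator::setDefaultInstance(allocator.get());
// ... run queries ...
cache->prepareShutdown();  // release cache-held memory before destruction
```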
std::vector parameters_; @@ -578,6 +581,7 @@ int tpchBenchmarkMain() { } else { benchmark.runAllCombinations(); } + benchmark.shutdown(); queryBuilder.reset(); return 0; } diff --git a/velox/common/caching/AsyncDataCache.cpp b/velox/common/caching/AsyncDataCache.cpp index 92d54deebc8b..3e0ff2db0c7b 100644 --- a/velox/common/caching/AsyncDataCache.cpp +++ b/velox/common/caching/AsyncDataCache.cpp @@ -32,7 +32,7 @@ AsyncDataCacheEntry::AsyncDataCacheEntry(CacheShard* shard) : shard_(shard) { } AsyncDataCacheEntry::~AsyncDataCacheEntry() { - shard_->cache()->freeNonContiguous(data_); + shard_->cache()->allocator()->freeNonContiguous(data_); } void AsyncDataCacheEntry::setExclusiveToShared() { @@ -108,7 +108,7 @@ void AsyncDataCacheEntry::initialize(FileCacheKey key) { tinyData_.clear(); auto sizePages = bits::roundUp(size_, memory::AllocationTraits::kPageSize) / memory::AllocationTraits::kPageSize; - if (cache->allocateNonContiguous(sizePages, data_)) { + if (cache->allocator()->allocateNonContiguous(sizePages, data_)) { cache->incrementCachedPages(data().numPages()); } else { // No memory to cover 'this'. @@ -324,7 +324,7 @@ void CacheShard::removeEntryLocked(AsyncDataCacheEntry* entry) { auto numPages = entry->data().numPages(); if (numPages) { cache_->incrementCachedPages(-numPages); - cache_->freeNonContiguous(entry->data()); + cache_->allocator()->freeNonContiguous(entry->data()); } } } @@ -412,7 +412,7 @@ void CacheShard::evict(uint64_t bytesToFree, bool evictAllUnpinned) { void CacheShard::freeAllocations(std::vector& allocations) { for (auto& allocation : allocations) { - cache_->freeNonContiguous(allocation); + cache_->allocator()->freeNonContiguous(allocation); } allocations.clear(); } @@ -495,8 +495,7 @@ void CacheShard::appendSsdSaveable(std::vector& pins) { } AsyncDataCache::AsyncDataCache( - const std::shared_ptr& allocator, - uint64_t /* maxBytes */, + memory::MemoryAllocator* allocator, std::unique_ptr ssdCache) : allocator_(allocator), ssdCache_(std::move(ssdCache)), cachedPages_(0) { for (auto i = 0; i < kNumShards; ++i) { @@ -504,15 +503,44 @@ AsyncDataCache::AsyncDataCache( } } -AsyncDataCache::AsyncDataCache( - const std::shared_ptr& allocator, - std::unique_ptr ssdCache) - : allocator_(allocator), ssdCache_(std::move(ssdCache)), cachedPages_(0) { - for (auto i = 0; i < kNumShards; ++i) { - shards_.push_back(std::make_unique(this)); +AsyncDataCache::~AsyncDataCache() {} + +// static +std::shared_ptr AsyncDataCache::create( + memory::MemoryAllocator* allocator, + std::unique_ptr ssdCache) { + auto cache = std::make_shared(allocator, std::move(ssdCache)); + allocator->registerCache(cache); + return cache; +} + +// static +AsyncDataCache* AsyncDataCache::getInstance() { + return *getInstancePtr(); +} + +// static +void AsyncDataCache::setInstance(AsyncDataCache* asyncDataCache) { + *getInstancePtr() = asyncDataCache; +} + +// static +AsyncDataCache** AsyncDataCache::getInstancePtr() { + static AsyncDataCache* cache_{nullptr}; + return &cache_; +} + +void AsyncDataCache::prepareShutdown() { + for (auto& shard : shards_) { + shard->prepareShutdown(); } } +void CacheShard::prepareShutdown() { + entries_.clear(); + freeEntries_.clear(); +} + CachePin AsyncDataCache::findOrCreate( RawFileCacheKey key, uint64_t size, @@ -611,50 +639,6 @@ void AsyncDataCache::backoff(int32_t counter) { std::this_thread::sleep_for(std::chrono::microseconds(usec)); // NOLINT } -bool AsyncDataCache::allocateNonContiguous( - MachinePageCount numPages, - memory::Allocation& out, - ReservationCallback 
reservationCB, - MachinePageCount minSizeClass) { - return makeSpace(numPages, [&]() { - return allocator_->allocateNonContiguous( - numPages, out, reservationCB, minSizeClass); - }); -} - -bool AsyncDataCache::allocateContiguous( - memory::MachinePageCount numPages, - memory::Allocation* collateral, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB, - memory::MachinePageCount maxPages) { - return makeSpace(numPages, [&]() { - return allocator_->allocateContiguous( - numPages, collateral, allocation, reservationCB, maxPages); - }); -} - -bool AsyncDataCache::growContiguous( - MachinePageCount increment, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB) { - return makeSpace(increment, [&]() { - return allocator_->growContiguous(increment, allocation, reservationCB); - }); -} - -void* AsyncDataCache::allocateBytes(uint64_t bytes, uint16_t alignment) { - void* result = nullptr; - makeSpace( - bits::roundUp(bytes, memory::AllocationTraits::kPageSize) / - memory::AllocationTraits::kPageSize, - [&]() { - result = allocator_->allocateBytes(bytes, alignment); - return result != nullptr; - }); - return result; -} - void AsyncDataCache::incrementNew(uint64_t size) { newBytes_ += size; if (!ssdCache_) { @@ -725,7 +709,7 @@ std::string AsyncDataCache::toString() const { << " read pins " << stats.numShared << " write pins " << stats.numExclusive << " unused prefetch " << stats.numPrefetch << " Alloc Megaclocks " << (stats.allocClocks >> 20) - << " allocated pages " << numAllocated() << " cached pages " + << " allocated pages " << allocator_->numAllocated() << " cached pages " << cachedPages_; out << "\nBacking: " << allocator_->toString(); if (ssdCache_) { diff --git a/velox/common/caching/AsyncDataCache.h b/velox/common/caching/AsyncDataCache.h index 2dd8a9098b2f..2a60f8f273f6 100644 --- a/velox/common/caching/AsyncDataCache.h +++ b/velox/common/caching/AsyncDataCache.h @@ -506,30 +506,36 @@ struct CacheStats { std::shared_ptr ssdStats = nullptr; }; -// Collection of cache entries whose key hashes to the same shard of -// the hash number space. The cache population is divided into shards -// to decrease contention on the mutex for the key to entry mapping -// and other housekeeping. + +/// Collection of cache entries whose key hashes to the same shard of +/// the hash number space. The cache population is divided into shards +/// to decrease contention on the mutex for the key to entry mapping +/// and other housekeeping. class CacheShard { public: explicit CacheShard(AsyncDataCache* FOLLY_NONNULL cache) : cache_(cache) {} - // See AsyncDataCache::findOrCreate. + /// See AsyncDataCache::findOrCreate. CachePin findOrCreate( RawFileCacheKey key, uint64_t size, folly::SemiFuture* readyFuture); - // Returns true if there is an entry for 'key'. Updates access time. + /// Returns true if there is an entry for 'key'. Updates access time. bool exists(RawFileCacheKey key) const; - AsyncDataCache* cache() { + AsyncDataCache* cache() const { return cache_; } + std::mutex& mutex() { return mutex_; } + /// Release any resources that consume memory from this 'CacheShard' for a + /// graceful shutdown. The shard will no longer be valid after this call. + void prepareShutdown(); + // removes 'bytesToFree' worth of entries or as many entries as are // not pinned. This favors first removing older and less frequently // used entries. 
If 'evictAllUnpinned' is true, anything that is @@ -609,101 +615,63 @@ class CacheShard { std::atomic allocClocks_; }; -class AsyncDataCache : public memory::MemoryAllocator { +class AsyncDataCache : public memory::Cache { public: - // TODO(jtan6): Remove this constructor after Presto Native switches to below - // constructor AsyncDataCache( - const std::shared_ptr& allocator, - uint64_t maxBytes, + memory::MemoryAllocator* allocator, std::unique_ptr ssdCache = nullptr); - AsyncDataCache( - const std::shared_ptr& allocator, + ~AsyncDataCache() override; + + static std::shared_ptr create( + memory::MemoryAllocator* allocator, std::unique_ptr ssdCache = nullptr); - // Finds or creates a cache entry corresponding to 'key'. The entry - // is returned in 'pin'. If the entry is new, it is pinned in - // exclusive mode and its 'data_' has uninitialized space for at - // least 'size' bytes. If the entry is in cache and already filled, - // the pin is in shared mode. If the entry is in exclusive mode for - // some other pin, the pin is empty. If 'waitFuture' is not nullptr - // and the pin is exclusive on some other pin, this is set to a - // future that is realized when the pin is no longer exclusive. When - // the future is realized, the caller may retry findOrCreate(). - // runtime error with code kNoCacheSpace if there is no space to create the - // new entry after evicting any unpinned content. + static AsyncDataCache* getInstance(); + + static void setInstance(AsyncDataCache* asyncDataCache); + + /// Release any resources that consume memory from 'allocator_' for a graceful + /// shutdown. The cache will no longer be valid after this call. + void prepareShutdown(); + + /// Calls 'allocate' until this returns true. Returns true if + /// allocate returns true. and Tries to evict at least 'numPages' of + /// cache after each failed call to 'allocate'. May pause to wait + /// for SSD cache flush if ''ssdCache_' is set and is busy + /// writing. Does random back-off after several failures and + /// eventually gives up. Allocation must not be serialized by a mutex + /// for memory arbitration to work. + bool makeSpace( + memory::MachinePageCount numPages, + std::function allocate) override; + + memory::MemoryAllocator* allocator() const override { + return allocator_; + } + + /// Finds or creates a cache entry corresponding to 'key'. The entry + /// is returned in 'pin'. If the entry is new, it is pinned in + /// exclusive mode and its 'data_' has uninitialized space for at + /// least 'size' bytes. If the entry is in cache and already filled, + /// the pin is in shared mode. If the entry is in exclusive mode for + /// some other pin, the pin is empty. If 'waitFuture' is not nullptr + /// and the pin is exclusive on some other pin, this is set to a + /// future that is realized when the pin is no longer exclusive. When + /// the future is realized, the caller may retry findOrCreate(). + /// runtime error with code kNoCacheSpace if there is no space to create the + /// new entry after evicting any unpinned content. CachePin findOrCreate( RawFileCacheKey key, uint64_t size, folly::SemiFuture* waitFuture = nullptr); - // Returns true if there is an entry for 'key'. Updates access time. + /// Returns true if there is an entry for 'key'. Updates access time. 
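A minimal usage sketch of findOrCreate() as documented above, assuming 'cache', a populated RawFileCacheKey 'key', and 'dataSize' are in scope:

```cpp
folly::SemiFuture<bool> wait(false);
auto pin = cache->findOrCreate(key, dataSize, &wait);
if (pin.empty()) {
  // Another thread holds the entry exclusively; wait, then retry.
  std::move(wait).wait();
  pin = cache->findOrCreate(key, dataSize, &wait);
}
```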
bool exists(RawFileCacheKey key) const; - Kind kind() const override { - return allocator_->kind(); - } - - size_t capacity() const override { - return allocator_->capacity(); - } - - bool allocateNonContiguous( - memory::MachinePageCount numPages, - memory::Allocation& out, - ReservationCallback reservationCB = nullptr, - memory::MachinePageCount minSizeClass = 0) override; - - int64_t freeNonContiguous(memory::Allocation& allocation) override { - return allocator_->freeNonContiguous(allocation); - } - - bool allocateContiguous( - memory::MachinePageCount numPages, - memory::Allocation* FOLLY_NULLABLE collateral, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, - memory::MachinePageCount maxPages = 0) override; - - void freeContiguous(memory::ContiguousAllocation& allocation) override { - allocator_->freeContiguous(allocation); - } - - bool growContiguous( - memory::MachinePageCount increment, - memory::ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) override; - - void* allocateBytes(uint64_t bytes, uint16_t alignment) override; - - void freeBytes(void* p, uint64_t size) noexcept override { - allocator_->freeBytes(p, size); - } - - bool checkConsistency() const override { - return allocator_->checkConsistency(); - } - - const std::vector& sizeClasses() const override { - return allocator_->sizeClasses(); - } - - size_t totalUsedBytes() const override { - return allocator_->totalUsedBytes(); - } - - memory::MachinePageCount numAllocated() const override { - return allocator_->numAllocated(); - } - - memory::MachinePageCount numMapped() const override { - return allocator_->numMapped(); - } - CacheStats refreshStats() const; - std::string toString() const override; + std::string toString() const; memory::MachinePageCount incrementCachedPages(int64_t pages) { // The counter is unsigned and the increment is signed. @@ -719,19 +687,19 @@ class AsyncDataCache : public memory::MemoryAllocator { return ssdCache_.get(); } - // Updates stats for creation of a new cache entry of 'size' bytes, - // i.e. a cache miss. Periodically updates SSD admission criteria, - // i.e. reconsider criteria every half cache capacity worth of misses. + /// Updates stats for creation of a new cache entry of 'size' bytes, + /// i.e. a cache miss. Periodically updates SSD admission criteria, + /// i.e. reconsider criteria every half cache capacity worth of misses. void incrementNew(uint64_t size); - // Updates statistics after bringing in 'bytes' worth of data that - // qualifies for SSD save and is not backed by SSD. Periodically - // triggers a background write of eligible entries to SSD. + /// Updates statistics after bringing in 'bytes' worth of data that + /// qualifies for SSD save and is not backed by SSD. Periodically + /// triggers a background write of eligible entries to SSD. void possibleSsdSave(uint64_t bytes); - // Sets a callback applied to new entries at the point where - // they are set to shared mode. Used for testing and can be used for - // e.g. checking checksums. + /// Sets a callback applied to new entries at the point where + /// they are set to shared mode. Used for testing and can be used for + /// e.g. checking checksums. 
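+  /// For example, a test might install:
+  ///   cache->setVerifyHook(
+  ///       [](const AsyncDataCacheEntry& entry) { /* verify a checksum */ });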
void setVerifyHook(std::function hook) { verifyHook_ = hook; } @@ -769,29 +737,16 @@ class AsyncDataCache : public memory::MemoryAllocator { return numSkippedSaves_; } - memory::Stats stats() const override { - return allocator_->stats(); - } - private: static constexpr int32_t kNumShards = 4; // Must be power of 2. static constexpr int32_t kShardMask = kNumShards - 1; + static AsyncDataCache** getInstancePtr(); + // Waits a pseudorandom delay times 'counter'. void backoff(int32_t counter); - // Calls 'allocate' until this returns true. Returns true if - // allocate returns true. and Tries to evict at least 'numPages' of - // cache after each failed call to 'allocate'. May pause to wait - // for SSD cache flush if ''ssdCache_' is set and is busy - // writing. Does random back-off after several failures and - // eventually gives up. Allocation must not be serialized by a mutex - // for memory arbitration to work. - bool makeSpace( - memory::MachinePageCount numPages, - std::function allocate); - - std::shared_ptr allocator_; + memory::MemoryAllocator* const allocator_; std::unique_ptr ssdCache_; std::vector> shards_; std::atomic shardCounter_{0}; diff --git a/velox/common/caching/tests/AsyncDataCacheTest.cpp b/velox/common/caching/tests/AsyncDataCacheTest.cpp index 1c3580a9d860..66bb6dd65cbd 100644 --- a/velox/common/caching/tests/AsyncDataCacheTest.cpp +++ b/velox/common/caching/tests/AsyncDataCacheTest.cpp @@ -81,6 +81,7 @@ class AsyncDataCacheTest : public testing::Test { if (ssdCache) { ssdCache->deleteFiles(); } + cache_->prepareShutdown(); } } @@ -104,18 +105,21 @@ class AsyncDataCacheTest : public testing::Test { } memory::MmapAllocator::Options options; options.capacity = maxBytes; - cache_ = std::make_shared( - std::make_shared(options), - maxBytes, - std::move(ssdCache)); + if (cache_) { + cache_->prepareShutdown(); + } + cache_.reset(); + allocator_.reset(); + allocator_ = std::make_shared(options); + cache_ = AsyncDataCache::create(allocator_.get(), std::move(ssdCache)); if (filenames_.empty()) { for (auto i = 0; i < kNumFiles; ++i) { auto name = fmt::format("testing_file_{}", i); filenames_.push_back(StringIdLease(fileIds(), name)); } } - ASSERT_EQ(cache_->kind(), MemoryAllocator::Kind::kMmap); - ASSERT_EQ(MemoryAllocator::kindString(cache_->kind()), "MMAP"); + ASSERT_EQ(cache_->allocator()->kind(), MemoryAllocator::Kind::kMmap); + ASSERT_EQ(MemoryAllocator::kindString(cache_->allocator()->kind()), "MMAP"); } // Finds one entry from RAM, SSD or storage. Throws if the data @@ -222,12 +226,13 @@ class AsyncDataCacheTest : public testing::Test { void clearAllocations(std::deque& allocations) { while (!allocations.empty()) { - cache_->freeNonContiguous(allocations.front()); + allocator_->freeNonContiguous(allocations.front()); allocations.pop_front(); } } std::shared_ptr tempDirectory_; + std::shared_ptr allocator_; std::shared_ptr cache_; std::vector filenames_; std::unique_ptr executor_; @@ -588,7 +593,7 @@ TEST_F(AsyncDataCacheTest, outOfCapacity) { pins.pop_front(); } memory::Allocation allocation; - ASSERT_FALSE(cache_->allocateNonContiguous(kSizeInPages, allocation)); + ASSERT_FALSE(allocator_->allocateNonContiguous(kSizeInPages, allocation)); // One 4 page entry below the max size of 4K 4 page entries in 16MB of // capacity. ASSERT_EQ(16384, cache_->incrementCachedPages(0)); @@ -597,14 +602,14 @@ TEST_F(AsyncDataCacheTest, outOfCapacity) { // We allocate the full capacity and expect the cache entries to go. 
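  // (With the cache registered on 'allocator_', each failed
  // allocateNonContiguous() call below evicts unpinned cache entries via
  // Cache::makeSpace() and retries, so the loop ends with the cache empty.)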
for (;;) { - if (!cache_->allocateNonContiguous(kSizeInPages, allocation)) { + if (!allocator_->allocateNonContiguous(kSizeInPages, allocation)) { break; } allocations.push_back(std::move(allocation)); } EXPECT_EQ(0, cache_->incrementCachedPages(0)); EXPECT_EQ(0, cache_->incrementPrefetchPages(0)); - EXPECT_EQ(16384, cache_->numAllocated()); + EXPECT_EQ(16384, allocator_->numAllocated()); clearAllocations(allocations); } diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 33ed1cf7f7ca..c4d7c93df2bd 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -45,6 +45,9 @@ class SsdFileTest : public testing::Test { if (ssdFile_) { ssdFile_->deleteFile(); } + if (cache_) { + cache_->prepareShutdown(); + } } void initializeCache( @@ -53,8 +56,7 @@ class SsdFileTest : public testing::Test { bool setNoCowFlag = false) { // tmpfs does not support O_DIRECT, so turn this off for testing. FLAGS_ssd_odirect = false; - cache_ = std::make_shared( - MemoryAllocator::createDefaultInstance(), maxBytes); + cache_ = AsyncDataCache::create(MemoryAllocator::getInstance()); fileName_ = StringIdLease(fileIds(), "fileInStorage"); diff --git a/velox/common/compression/Compression.cpp b/velox/common/compression/Compression.cpp index c200a62d7021..eae2db24379c 100644 --- a/velox/common/compression/Compression.cpp +++ b/velox/common/compression/Compression.cpp @@ -15,11 +15,47 @@ */ #include "velox/common/compression/Compression.h" +#include "velox/common/base/Exceptions.h" #include namespace facebook::velox::common { +std::unique_ptr compressionKindToCodec(CompressionKind kind) { + switch (static_cast(kind)) { + case CompressionKind_NONE: + return getCodec(folly::io::CodecType::NO_COMPRESSION); + case CompressionKind_ZLIB: + return getCodec(folly::io::CodecType::ZLIB); + case CompressionKind_SNAPPY: + return getCodec(folly::io::CodecType::SNAPPY); + case CompressionKind_ZSTD: + return getCodec(folly::io::CodecType::ZSTD); + case CompressionKind_LZ4: + return getCodec(folly::io::CodecType::LZ4); + default: + VELOX_UNSUPPORTED( + "Not support {} in folly", compressionKindToString(kind)); + } +} + +CompressionKind codecTypeToCompressionKind(folly::io::CodecType type) { + switch (type) { + case folly::io::CodecType::NO_COMPRESSION: + return CompressionKind_NONE; + case folly::io::CodecType::ZLIB: + return CompressionKind_ZLIB; + case folly::io::CodecType::SNAPPY: + return CompressionKind_SNAPPY; + case folly::io::CodecType::ZSTD: + return CompressionKind_ZSTD; + case folly::io::CodecType::LZ4: + return CompressionKind_LZ4; + default: + VELOX_UNSUPPORTED("Not support folly codec type {}", type); + } +} + std::string compressionKindToString(CompressionKind kind) { switch (static_cast(kind)) { case CompressionKind_NONE: diff --git a/velox/common/compression/Compression.h b/velox/common/compression/Compression.h index 2262c7785911..c1af44fd606a 100644 --- a/velox/common/compression/Compression.h +++ b/velox/common/compression/Compression.h @@ -16,6 +16,7 @@ #pragma once +#include #include namespace facebook::velox::common { @@ -31,6 +32,10 @@ enum CompressionKind { CompressionKind_MAX = INT64_MAX }; +std::unique_ptr compressionKindToCodec(CompressionKind kind); + +CompressionKind codecTypeToCompressionKind(folly::io::CodecType type); + /** * Get the name of the CompressionKind. 
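 * For example, compressionKindToString(CompressionKind_ZSTD) returns "zstd",
 * and an unrecognized value is rendered as "unknown - <value>".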
*/ diff --git a/velox/common/compression/tests/CompressionTest.cpp b/velox/common/compression/tests/CompressionTest.cpp index b15e9207ef1e..0659fa18222e 100644 --- a/velox/common/compression/tests/CompressionTest.cpp +++ b/velox/common/compression/tests/CompressionTest.cpp @@ -24,7 +24,7 @@ using namespace facebook::velox::common; class CompressionTest : public testing::Test {}; -TEST(CompressionTest, testCompressionNames) { +TEST_F(CompressionTest, testCompressionNames) { EXPECT_EQ("none", compressionKindToString(CompressionKind_NONE)); EXPECT_EQ("zlib", compressionKindToString(CompressionKind_ZLIB)); EXPECT_EQ("snappy", compressionKindToString(CompressionKind_SNAPPY)); @@ -35,3 +35,15 @@ TEST(CompressionTest, testCompressionNames) { "unknown - 99", compressionKindToString(static_cast(99))); } + +TEST_F(CompressionTest, compressionKindToCodec) { + ASSERT_EQ( + folly::io::CodecType::NO_COMPRESSION, + compressionKindToCodec(CompressionKind::CompressionKind_NONE)->type()); + ASSERT_EQ( + folly::io::CodecType::LZ4, + compressionKindToCodec(CompressionKind::CompressionKind_LZ4)->type()); + EXPECT_THROW( + compressionKindToCodec(CompressionKind::CompressionKind_LZO), + facebook::velox::VeloxException); +} diff --git a/velox/common/memory/MallocAllocator.cpp b/velox/common/memory/MallocAllocator.cpp index df63562916cd..73c35c18a3f1 100644 --- a/velox/common/memory/MallocAllocator.cpp +++ b/velox/common/memory/MallocAllocator.cpp @@ -23,7 +23,7 @@ namespace facebook::velox::memory { MallocAllocator::MallocAllocator(size_t capacity) : kind_(MemoryAllocator::Kind::kMalloc), capacity_(capacity) {} -bool MallocAllocator::allocateNonContiguous( +bool MallocAllocator::allocateNonContiguousWithoutRetry( MachinePageCount numPages, Allocation& out, ReservationCallback reservationCB, @@ -109,6 +109,20 @@ bool MallocAllocator::allocateNonContiguous( return true; } +bool MallocAllocator::allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB, + MachinePageCount maxPages) { + bool result; + stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { + result = allocateContiguousImpl( + numPages, collateral, allocation, reservationCB, maxPages); + }); + return result; +} + bool MallocAllocator::allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, @@ -216,6 +230,11 @@ int64_t MallocAllocator::freeNonContiguous(Allocation& allocation) { return freedBytes; } +void MallocAllocator::freeContiguous(ContiguousAllocation& allocation) { + stats_.recordFree( + allocation.size(), [&]() { freeContiguousImpl(allocation); }); +} + void MallocAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { if (allocation.empty()) { return; @@ -233,7 +252,7 @@ void MallocAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { allocation.clear(); } -bool MallocAllocator::growContiguous( +bool MallocAllocator::growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB) { @@ -259,7 +278,9 @@ bool MallocAllocator::growContiguous( return true; } -void* MallocAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { +void* MallocAllocator::allocateBytesWithoutRetry( + uint64_t bytes, + uint16_t alignment) { if (!incrementUsage(bytes)) { return nullptr; } @@ -279,7 +300,7 @@ void* MallocAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { return result; } -void* MallocAllocator::allocateZeroFilled(uint64_t 
bytes) { +void* MallocAllocator::allocateZeroFilledWithoutRetry(uint64_t bytes) { if (!incrementUsage(bytes)) { return nullptr; } diff --git a/velox/common/memory/MallocAllocator.h b/velox/common/memory/MallocAllocator.h index 24503c4570a1..debc07cdc962 100644 --- a/velox/common/memory/MallocAllocator.h +++ b/velox/common/memory/MallocAllocator.h @@ -37,6 +37,17 @@ class MallocAllocator : public MemoryAllocator { } } + void registerCache(const std::shared_ptr& cache) override { + VELOX_CHECK_NULL(cache_); + VELOX_CHECK_NOT_NULL(cache); + VELOX_CHECK(cache->allocator() == this); + cache_ = cache; + } + + Cache* cache() const override { + return cache_.get(); + } + Kind kind() const override { return kind_; } @@ -45,42 +56,15 @@ class MallocAllocator : public MemoryAllocator { return capacity_; } - bool allocateNonContiguous( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) override; + void freeContiguous(ContiguousAllocation& allocation) override; int64_t freeNonContiguous(Allocation& allocation) override; - bool allocateContiguous( - MachinePageCount numPages, - Allocation* collateral, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, - MachinePageCount maxPages = 0) override { - bool result; - stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { - result = allocateContiguousImpl( - numPages, collateral, allocation, reservationCB, maxPages); - }); - return result; - } - - void freeContiguous(ContiguousAllocation& allocation) override { - stats_.recordFree( - allocation.size(), [&]() { freeContiguousImpl(allocation); }); - } - - bool growContiguous( + bool growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB = nullptr) override; - void* allocateBytes(uint64_t bytes, uint16_t alignment) override; - - void* allocateZeroFilled(uint64_t bytes) override; - void freeBytes(void* p, uint64_t bytes) noexcept override; size_t totalUsedBytes() const override { @@ -95,15 +79,24 @@ class MallocAllocator : public MemoryAllocator { return numMapped_; } - Stats stats() const override { - return stats_; - } - bool checkConsistency() const override; std::string toString() const override; private: + bool allocateNonContiguousWithoutRetry( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB = nullptr, + MachinePageCount minSizeClass = 0) override; + + bool allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* FOLLY_NULLABLE collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr, + MachinePageCount maxPages = 0) override; + bool allocateContiguousImpl( MachinePageCount numPages, Allocation* FOLLY_NULLABLE collateral, @@ -113,6 +106,10 @@ class MallocAllocator : public MemoryAllocator { void freeContiguousImpl(ContiguousAllocation& allocation); + void* allocateBytesWithoutRetry(uint64_t bytes, uint16_t alignment) override; + + void* allocateZeroFilledWithoutRetry(uint64_t bytes) override; + /// Increment current usage and check current allocator consistency to make /// sure current usage does not go above 'capacity_'. If it goes above /// 'capacity_', the increment will not be applied. Returns true if within @@ -161,6 +158,6 @@ class MallocAllocator : public MemoryAllocator { /// Tracks malloc'd pointers to detect bad frees. 
std::unordered_set<void*> mallocs_; - Stats stats_; + std::shared_ptr<Cache> cache_; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/MemoryAllocator.cpp b/velox/common/memory/MemoryAllocator.cpp index ccdbb4c448d9..a4a684e3384b 100644 --- a/velox/common/memory/MemoryAllocator.cpp +++ b/velox/common/memory/MemoryAllocator.cpp @@ -160,7 +160,74 @@ MachinePageCount MemoryAllocator::roundUpToSizeClassSize( return *std::lower_bound(sizes.begin(), sizes.end(), pages); } +bool MemoryAllocator::allocateNonContiguous( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB, + MachinePageCount minSizeClass) { + if (cache() == nullptr) { + return allocateNonContiguousWithoutRetry( + numPages, out, reservationCB, minSizeClass); + } + return cache()->makeSpace(numPages, [&]() { + return allocateNonContiguousWithoutRetry( + numPages, out, reservationCB, minSizeClass); + }); +} + +bool MemoryAllocator::allocateContiguous( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB, + MachinePageCount maxPages) { + if (cache() == nullptr) { + return allocateContiguousWithoutRetry( + numPages, collateral, allocation, reservationCB, maxPages); + } + return cache()->makeSpace(numPages, [&]() { + return allocateContiguousWithoutRetry( + numPages, collateral, allocation, reservationCB, maxPages); + }); +} + +bool MemoryAllocator::growContiguous( + MachinePageCount increment, + ContiguousAllocation& allocation, + ReservationCallback reservationCB) { + if (cache() == nullptr) { + return growContiguousWithoutRetry(increment, allocation, reservationCB); + } + return cache()->makeSpace(increment, [&]() { + return growContiguousWithoutRetry(increment, allocation, reservationCB); + }); +} + +void* MemoryAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { + if (cache() == nullptr) { + return allocateBytesWithoutRetry(bytes, alignment); + } + void* result = nullptr; + cache()->makeSpace(AllocationTraits::numPages(bytes), [&]() { + result = allocateBytesWithoutRetry(bytes, alignment); + return result != nullptr; + }); + return result; +} + void* MemoryAllocator::allocateZeroFilled(uint64_t bytes) { + if (cache() == nullptr) { + return allocateZeroFilledWithoutRetry(bytes); + } + void* result = nullptr; + cache()->makeSpace(AllocationTraits::numPages(bytes), [&]() { + result = allocateZeroFilledWithoutRetry(bytes); + return result != nullptr; + }); + return result; +} + +void* MemoryAllocator::allocateZeroFilledWithoutRetry(uint64_t bytes) { void* result = allocateBytes(bytes); if (result != nullptr) { ::memset(result, 0, bytes); diff --git a/velox/common/memory/MemoryAllocator.h b/velox/common/memory/MemoryAllocator.h index 6cd0c76d6a6b..cae015620256 100644 --- a/velox/common/memory/MemoryAllocator.h +++ b/velox/common/memory/MemoryAllocator.h @@ -139,6 +139,24 @@ struct Stats { int64_t numAdvise{0}; }; +class MemoryAllocator; + +/// A general cache interface using 'MemoryAllocator' to allocate memory, which +/// is also able to free up memory upon request by shrinking itself. +class Cache { + public: + virtual ~Cache() = default; + /// This method should be implemented so that it tries to accommodate the + /// passed in 'allocate' by freeing up space from 'this' if needed. 'numPages' + /// is the number of pages 'allocate' tries to allocate. It should return true + /// if 'allocate' succeeds, and false otherwise.
+ virtual bool makeSpace( + memory::MachinePageCount numPages, + std::function allocate) = 0; + + virtual MemoryAllocator* allocator() const = 0; +}; + /// This class provides interface for the actual memory allocations from memory /// pool. It allocates runs of machine pages from predefined size classes, and /// supports both contiguous and non-contiguous memory allocations. An @@ -193,6 +211,11 @@ class MemoryAllocator : public std::enable_shared_from_this { /// the kind of the delegated memory allocator underneath. virtual Kind kind() const = 0; + /// Registers a 'Cache' that is used for freeing up space when this allocator + /// is under memory pressure. The allocator of registered 'Cache' needs to be + /// the same as 'this'. + virtual void registerCache(const std::shared_ptr& cache) = 0; + using ReservationCallback = std::function; /// Returns the capacity of the allocator in bytes. @@ -210,18 +233,20 @@ class MemoryAllocator : public std::enable_shared_from_this { /// 'reservationCB' before the actual memory allocation so it needs to release /// the reservation if the actual allocation fails halfway. The function /// returns true if the allocation succeeded. If it returns false, 'out' - /// references no memory and any partially allocated memory is freed. + /// references no memory and any partially allocated memory is freed. The + /// function might retry allocation failure by making space from 'cache()' if + /// registered. But sufficient space is not guaranteed. /// /// NOTE: /// - 'out' is guaranteed to be freed if it's not empty. /// - Allocation is not guaranteed even if collateral 'out' is larger than /// 'numPages', because this method is not atomic. /// - Throws if allocation exceeds capacity. - virtual bool allocateNonContiguous( + bool allocateNonContiguous( MachinePageCount numPages, Allocation& out, ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) = 0; + MachinePageCount minSizeClass = 0); /// Frees non-contiguous 'allocation'. 'allocation' is empty on return. The /// function returns the actual freed bytes. @@ -241,19 +266,18 @@ class MemoryAllocator : public std::enable_shared_from_this { /// cleared. /// /// NOTE: - 'collateral' and passed in 'allocation' are guaranteed - /// to be freed. If 'maxPages' is non-0, 'maxPages' worth of - /// address space is mapped but the utilization in the allocator and - /// pool is incremented by 'numPages'. This allows reserving - /// a large range of addresses for use with huge pages without - /// declaring the whole range as held by the query. The reservation - /// will be increased as and if addresses in the range are used. See - /// growContiguous(). - virtual bool allocateContiguous( + /// to be freed. If 'maxPages' is non-0, 'maxPages' worth of address space is + /// mapped but the utilization in the allocator and pool is incremented by + /// 'numPages'. This allows reserving a large range of addresses for use with + /// huge pages without declaring the whole range as held by the query. The + /// reservation will be increased as and if addresses in the range are used. + /// See growContiguous(). + bool allocateContiguous( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, ReservationCallback reservationCB = nullptr, - MachinePageCount maxPages = 0) = 0; + MachinePageCount maxPages = 0); /// Frees contiguous 'allocation'. 'allocation' is empty on return. 
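A minimal sketch of the resulting retry wiring, using the names introduced in this header and assuming an MmapAllocator 'options' struct:

```cpp
auto allocator = std::make_shared<memory::MmapAllocator>(options);
auto cache = cache::AsyncDataCache::create(allocator.get());
// create() calls allocator->registerCache(cache), so cache() is non-null
// and a failing allocation below is retried after evicting unpinned
// cache entries through Cache::makeSpace().
memory::Allocation out;
bool ok = allocator->allocateNonContiguous(16, out);
```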
virtual void freeContiguous(ContiguousAllocation& allocation) = 0; @@ -262,24 +286,26 @@ class MemoryAllocator : public std::enable_shared_from_this { /// 'increment'. false if would exceed capacity, Throws if size /// would exceed maxSize given in allocateContiguous(). Calls reservationCB /// before increasing the utilization and returns false with no effect if this - /// fails. - virtual bool growContiguous( + /// fails. The function might retry allocation failure by making + /// space from 'cache()' if registered. But sufficient space is not guaranteed + bool growContiguous( MachinePageCount increment, ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) = 0; + ReservationCallback reservationCB = nullptr); /// Allocates contiguous 'bytes' and return the first byte. Returns nullptr if - /// there is no space. + /// there is no space. The function might retry allocation failure by making + /// space from 'cache()' if registered. But sufficient space is not + /// guaranteed. /// /// NOTE: 'alignment' must be power of two and in range of /// [kMinAlignment, kMaxAlignment]. - virtual void* allocateBytes( - uint64_t bytes, - uint16_t alignment = kMinAlignment) = 0; + void* allocateBytes(uint64_t bytes, uint16_t alignment = kMinAlignment); /// Allocates a zero-filled contiguous bytes. Returns nullptr if there is no - /// space - virtual void* allocateZeroFilled(uint64_t bytes); + /// space. The function might retry allocation failure by making space from + /// 'cache()' if registered. But sufficient space is not guaranteed. + void* allocateZeroFilled(uint64_t bytes); /// Frees contiguous memory allocated by allocateBytes, allocateZeroFilled, /// reallocateBytes. @@ -306,7 +332,7 @@ class MemoryAllocator : public std::enable_shared_from_this { virtual MachinePageCount numMapped() const = 0; virtual Stats stats() const { - return Stats(); + return stats_; } virtual std::string toString() const = 0; @@ -356,7 +382,38 @@ class MemoryAllocator : public std::enable_shared_from_this { } protected: - MemoryAllocator() = default; + explicit MemoryAllocator() = default; + + /// The actual memory allocation function implementation without retry + /// attempts by making space from cache. + virtual bool allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr, + MachinePageCount maxPages = 0) = 0; + + virtual bool allocateNonContiguousWithoutRetry( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB, + MachinePageCount minSizeClass) = 0; + + virtual void* allocateBytesWithoutRetry( + uint64_t bytes, + uint16_t alignment) = 0; + + virtual void* allocateZeroFilledWithoutRetry(uint64_t bytes); + + virtual bool growContiguousWithoutRetry( + MachinePageCount increment, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr) = 0; + + // 'Cache' getter. The cache is only responsible for freeing up memory space + // by shrinking itself when there is not enough space upon allocating. The + // free of space is not guaranteed. + virtual Cache* cache() const = 0; // Returns the size class size that corresponds to 'bytes'. 
static MachinePageCount roundUpToSizeClassSize( @@ -422,6 +479,8 @@ class MemoryAllocator : public std::enable_shared_from_this { InjectedFailure injectedFailure_{InjectedFailure::kNone}; bool isPersistentFailureInjection_{false}; + Stats stats_; + private: static std::mutex initMutex_; // Singleton instance. diff --git a/velox/common/memory/MmapAllocator.cpp b/velox/common/memory/MmapAllocator.cpp index 8da35bc5bd5a..3964b2268cb5 100644 --- a/velox/common/memory/MmapAllocator.cpp +++ b/velox/common/memory/MmapAllocator.cpp @@ -51,7 +51,7 @@ MmapAllocator::~MmapAllocator() { (numAllocated_ == 0) && (numExternalMapped_ == 0), "{}", toString()); } -bool MmapAllocator::allocateNonContiguous( +bool MmapAllocator::allocateNonContiguousWithoutRetry( MachinePageCount numPages, Allocation& out, ReservationCallback reservationCB, @@ -211,6 +211,20 @@ MachinePageCount MmapAllocator::freeInternal(Allocation& allocation) { return numFreed; } +bool MmapAllocator::allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB, + MachinePageCount maxPages) { + bool result; + stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { + result = allocateContiguousImpl( + numPages, collateral, allocation, reservationCB, maxPages); + }); + return result; +} + bool MmapAllocator::allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, @@ -364,6 +378,11 @@ bool MmapAllocator::allocateContiguousImpl( return true; } +void MmapAllocator::freeContiguous(ContiguousAllocation& allocation) { + stats_.recordFree( + allocation.size(), [&]() { freeContiguousImpl(allocation); }); +} + void MmapAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { if (allocation.empty()) { return; @@ -384,7 +403,7 @@ void MmapAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { allocation.clear(); } -bool MmapAllocator::growContiguous( +bool MmapAllocator::growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB) { @@ -430,7 +449,9 @@ bool MmapAllocator::growContiguous( return true; } -void* MmapAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { +void* MmapAllocator::allocateBytesWithoutRetry( + uint64_t bytes, + uint16_t alignment) { alignmentCheck(bytes, alignment); if (useMalloc(bytes)) { @@ -448,7 +469,8 @@ void* MmapAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { if (bytes <= AllocationTraits::pageBytes(sizeClassSizes_.back())) { Allocation allocation; const auto numPages = roundUpToSizeClassSize(bytes, sizeClassSizes_); - if (!allocateNonContiguous(numPages, allocation, nullptr, numPages)) { + if (!allocateNonContiguousWithoutRetry( + numPages, allocation, nullptr, numPages)) { return nullptr; } auto run = allocation.runAt(0); @@ -463,7 +485,7 @@ void* MmapAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { ContiguousAllocation allocation; auto numPages = bits::roundUp(bytes, AllocationTraits::kPageSize) / AllocationTraits::kPageSize; - if (!allocateContiguous(numPages, nullptr, allocation)) { + if (!allocateContiguousWithoutRetry(numPages, nullptr, allocation)) { return nullptr; } diff --git a/velox/common/memory/MmapAllocator.h b/velox/common/memory/MmapAllocator.h index 0ecc872e17c9..43863af9f535 100644 --- a/velox/common/memory/MmapAllocator.h +++ b/velox/common/memory/MmapAllocator.h @@ -85,56 +85,29 @@ class MmapAllocator : public MemoryAllocator { return kind_; } - size_t 
capacity() const override { - return AllocationTraits::pageBytes(capacity_); + void registerCache(const std::shared_ptr& cache) override { + VELOX_CHECK_NULL(cache_); + VELOX_CHECK_NOT_NULL(cache); + VELOX_CHECK(cache->allocator() == this); + cache_ = cache; } - bool allocateNonContiguous( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) override; - - int64_t freeNonContiguous(Allocation& allocation) override; - - bool allocateContiguous( - MachinePageCount numPages, - Allocation* collateral, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, - MachinePageCount maxPages = 0) override { - bool result; - stats_.recordAllocate(numPages * AllocationTraits::kPageSize, 1, [&]() { - result = allocateContiguousImpl( - numPages, collateral, allocation, reservationCB, maxPages); - }); - return result; + Cache* cache() const override { + return cache_.get(); } - void freeContiguous(ContiguousAllocation& allocation) override { - stats_.recordFree( - allocation.size(), [&]() { freeContiguousImpl(allocation); }); + size_t capacity() const override { + return AllocationTraits::pageBytes(capacity_); } - bool growContiguous( + bool growContiguousWithoutRetry( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB = nullptr) override; - /// Allocates 'bytes' contiguous bytes and returns the pointer to the first - /// byte. If 'bytes' is less than 'maxMallocBytes_', delegates the allocation - /// to malloc. If the size is above that and below the largest size classes' - /// size, allocates one element of the next size classes' size. If 'size' is - /// greater than the largest size classes' size, calls allocateContiguous(). - /// Returns nullptr if there is no space. The amount to allocate is subject to - /// the size limit of 'this'. This function is not virtual but calls the - /// virtual functions allocateNonContiguous and allocateContiguous, which can - /// track sizes and enforce caps etc. If 'alignment' is not kMinAlignment, - /// then 'bytes' must be a multiple of 'alignment'. - /// - /// NOTE: 'alignment' must be power of two and in range of [kMinAlignment, - /// kMaxAlignment]. - void* allocateBytes(uint64_t bytes, uint16_t alignment) override; + void freeContiguous(ContiguousAllocation& allocation) override; + + int64_t freeNonContiguous(Allocation& allocation) override; void freeBytes(void* p, uint64_t bytes) noexcept override; @@ -339,6 +312,19 @@ class MmapAllocator : public MemoryAllocator { uint64_t numAdvisedAway_ = 0; }; + bool allocateNonContiguousWithoutRetry( + MachinePageCount numPages, + Allocation& out, + ReservationCallback reservationCB = nullptr, + MachinePageCount minSizeClass = 0) override; + + bool allocateContiguousWithoutRetry( + MachinePageCount numPages, + Allocation* collateral, + ContiguousAllocation& allocation, + ReservationCallback reservationCB = nullptr, + MachinePageCount maxPages = 0) override; + bool allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, @@ -348,6 +334,21 @@ class MmapAllocator : public MemoryAllocator { void freeContiguousImpl(ContiguousAllocation& allocation); + // Allocates 'bytes' contiguous bytes and returns the pointer to the first + // byte. If 'bytes' is less than 'maxMallocBytes_', delegates the allocation + // to malloc. If the size is above that and below the largest size classes' + // size, allocates one element of the next size classes' size. 
If 'size' is + // greater than the largest size classes' size, calls allocateContiguous(). + // Returns nullptr if there is no space. The amount to allocate is subject to + // the size limit of 'this'. This function is not virtual but calls the + // virtual functions allocateNonContiguous and allocateContiguous, which can + // track sizes and enforce caps etc. If 'alignment' is not kMinAlignment, + // then 'bytes' must be a multiple of 'alignment'. + // + // NOTE: 'alignment' must be power of two and in range of [kMinAlignment, + // kMaxAlignment]. + void* allocateBytesWithoutRetry(uint64_t bytes, uint16_t alignment) override; + // Ensures that there are at least 'newMappedNeeded' pages that are // not backing any existing allocation. If capacity_ - numMapped_ < // newMappedNeeded, advises away enough pages backing freed slots in @@ -418,7 +419,7 @@ class MmapAllocator : public MemoryAllocator { std::mutex arenaMutex_; std::unique_ptr managedArenas_; - Stats stats_; + std::shared_ptr cache_; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/MemoryPoolTest.cpp b/velox/common/memory/tests/MemoryPoolTest.cpp index e2e71c3496c7..eabc39a3a958 100644 --- a/velox/common/memory/tests/MemoryPoolTest.cpp +++ b/velox/common/memory/tests/MemoryPoolTest.cpp @@ -88,18 +88,16 @@ class MemoryPoolTest : public testing::TestWithParam { MmapAllocator::Options opts{8UL << 30}; allocator_ = std::make_shared(opts); if (useCache_) { - cache_ = - std::make_shared(allocator_, kCapacity, nullptr); - MemoryAllocator::setDefaultInstance(cache_.get()); + cache_ = AsyncDataCache::create(allocator_.get()); + MemoryAllocator::setDefaultInstance(allocator_.get()); } else { MemoryAllocator::setDefaultInstance(allocator_.get()); } } else { allocator_ = MemoryAllocator::createDefaultInstance(); if (useCache_) { - cache_ = - std::make_shared(allocator_, kCapacity, nullptr); - MemoryAllocator::setDefaultInstance(cache_.get()); + cache_ = AsyncDataCache::create(allocator_.get()); + MemoryAllocator::setDefaultInstance(allocator_.get()); } else { MemoryAllocator::setDefaultInstance(allocator_.get()); } @@ -111,6 +109,9 @@ class MemoryPoolTest : public testing::TestWithParam { } void TearDown() override { + if (useCache_) { + cache_->prepareShutdown(); + } allocator_->testingClearFailureInjection(); MmapAllocator::setDefaultInstance(nullptr); } diff --git a/velox/common/memory/tests/SharedArbitratorTest.cpp b/velox/common/memory/tests/SharedArbitratorTest.cpp index c4a04ed84f7c..660974cd5d97 100644 --- a/velox/common/memory/tests/SharedArbitratorTest.cpp +++ b/velox/common/memory/tests/SharedArbitratorTest.cpp @@ -27,6 +27,7 @@ #include "velox/common/memory/MemoryArbitrator.h" #include "velox/common/memory/SharedArbitrator.h" #include "velox/common/testutil/TestValue.h" +#include "velox/exec/HashTable.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/HiveConnectorTestBase.h" #include "velox/exec/tests/utils/PlanBuilder.h" @@ -311,7 +312,7 @@ class SharedArbitrationTest : public exec::test::HiveConnectorTestBase { executor_.get(), std::unordered_map{}, configs, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), std::move(pool)); return queryCtx; } @@ -1584,113 +1585,43 @@ DEBUG_ONLY_TEST_F( createDuckDbTable(vectors); std::shared_ptr joinQueryCtx = newQueryCtx(kMemoryCapacity); - std::shared_ptr fakeQueryCtx = newQueryCtx(kMemoryCapacity); - // Set fake operator to reclaimable to allow arbitration to succeed. 
- fakeOperatorFactory_->setCanReclaim(true); - - folly::EventCount waitForPrepareJoin; - folly::EventCount waitForFakeAllocationDone; - std::atomic fakeAllocationDone{false}; - std::atomic startPrepareJoin{false}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (fakeAllocationDone) { - return Allocation{}; - } - - // Wait for hash join build to start table build at the end of hash build - // phase. - waitForPrepareJoin.await([&]() { return startPrepareJoin.load(); }); - - // Set to allocate all the remaining free memory from the arbitrator. - const auto allocationSize = - kMemoryCapacity - joinQueryCtx->pool()->currentBytes(); - auto buffer = op->pool()->allocate(allocationSize); - // Unblock table build and expect any memory allocation by parallel table - // build to trigger memory arbitration. - fakeAllocationDone = true; - waitForFakeAllocationDone.notifyAll(); - return Allocation{op->pool(), buffer, allocationSize}; - }); - - std::vector extraAllocations; + // Make sure the parallel build has been triggered. + std::atomic parallelBuildTriggered{false}; SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashBuild::prepareJoinTable", - std::function*)>( - ([&](std::vector* buildOps) { - // Free up the unused memory reservations from all the hash build - // memory pool to ensure triggering memory arbitration in parallel - // build. - for (auto* op : *buildOps) { - const size_t allocationSize = op->pool()->availableReservation(); - if (allocationSize > 0) { - extraAllocations.push_back(Allocation{ - op->pool(), - op->pool()->allocate(allocationSize), - allocationSize}); - } - } - // Unblock fake memory allocation to allocate all the freed memory - // from arbitrator. - startPrepareJoin = true; - waitForPrepareJoin.notifyAll(); - // Wait for the fake memory allocation to complete before - // proceeding with the parallel build. - waitForFakeAllocationDone.await( - [&]() { return fakeAllocationDone.load(); }); - }))); - - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - // Set very low table size threshold to trigger parallel build. - .config( - core::QueryConfig::kMinTableRowsForParallelJoinBuild, - std::to_string(0)) - // Set multiple hash build drivers to trigger parallel build. - .maxDrivers(4) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0", "t1"}, - {"u1", "u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); + "facebook::velox::exec::HashTable::parallelJoinBuild", + std::function( + [&](void*) { parallelBuildTriggered = true; })); - // Free up the extra memory allocations. - for (auto& allocation : extraAllocations) { - allocation.free(); - } - extraAllocations.clear(); - }); + // TODO: add driver context to test if the memory allocation is triggered in + // driver context or not. 
- std::shared_ptr fakeMemoryTask; - std::thread memThread([&]() { - fakeMemoryTask = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - joinThread.join(); - memThread.join(); - fakeMemoryTask.reset(); + auto planNodeIdGenerator = std::make_shared(); + AssertQueryBuilder(duckDbQueryRunner_) + // Set very low table size threshold to trigger parallel build. + .config( + core::QueryConfig::kMinTableRowsForParallelJoinBuild, + std::to_string(0)) + // Set multiple hash build drivers to trigger parallel build. + .maxDrivers(4) + .queryCtx(joinQueryCtx) + .plan(PlanBuilder(planNodeIdGenerator) + .values(vectors, true) + .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) + .hashJoin( + {"t0", "t1"}, + {"u1", "u0"}, + PlanBuilder(planNodeIdGenerator) + .values(vectors, true) + .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) + .planNode(), + "", + {"t1"}, + core::JoinType::kInner) + .planNode()) + .assertResults( + "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); + ASSERT_TRUE(parallelBuildTriggered); Task::testingWaitForAllTasksToBeDeleted(); } diff --git a/velox/connectors/Connector.h b/velox/connectors/Connector.h index e11d5f600892..b8fa93f191dc 100644 --- a/velox/connectors/Connector.h +++ b/velox/connectors/Connector.h @@ -17,6 +17,7 @@ #include "velox/common/base/AsyncSource.h" #include "velox/common/base/RuntimeMetrics.h" +#include "velox/common/caching/AsyncDataCache.h" #include "velox/common/caching/ScanTracker.h" #include "velox/common/future/VeloxPromise.h" #include "velox/core/ExpressionEvaluator.h" @@ -223,7 +224,7 @@ class ConnectorQueryCtx { memory::MemoryPool* connectorPool, const Config* connectorConfig, std::unique_ptr expressionEvaluator, - memory::MemoryAllocator* FOLLY_NONNULL allocator, + cache::AsyncDataCache* cache, const std::string& queryId, const std::string& taskId, const std::string& planNodeId, @@ -232,7 +233,7 @@ class ConnectorQueryCtx { connectorPool_(connectorPool), config_(connectorConfig), expressionEvaluator_(std::move(expressionEvaluator)), - allocator_(allocator), + cache_(cache), scanId_(fmt::format("{}.{}", taskId, planNodeId)), queryId_(queryId), taskId_(taskId), @@ -260,10 +261,8 @@ class ConnectorQueryCtx { return expressionEvaluator_.get(); } - // MemoryAllocator for large allocations. Used for caching with - // CachedBufferedImput if this implements cache::AsyncDataCache. - memory::MemoryAllocator* FOLLY_NONNULL allocator() const { - return allocator_; + cache::AsyncDataCache* cache() const { + return cache_; } // This is a combination of task id and the scan's PlanNodeId. This is an id @@ -295,7 +294,7 @@ class ConnectorQueryCtx { memory::MemoryPool* connectorPool_; const Config* FOLLY_NONNULL config_; std::unique_ptr expressionEvaluator_; - memory::MemoryAllocator* FOLLY_NONNULL allocator_; + cache::AsyncDataCache* cache_; const std::string scanId_; const std::string queryId_; const std::string taskId_; diff --git a/velox/connectors/hive/HiveConfig.cpp b/velox/connectors/hive/HiveConfig.cpp index 534f7887371a..cc7d1aa6564f 100644 --- a/velox/connectors/hive/HiveConfig.cpp +++ b/velox/connectors/hive/HiveConfig.cpp @@ -155,4 +155,9 @@ int32_t HiveConfig::maxCoalescedDistanceBytes(const Config* config) { return config->get(kMaxCoalescedDistanceBytes, 512 << 10); } +// static. 
+int32_t HiveConfig::numCacheFileHandles(const Config* config) { + return config->get(kNumCacheFileHandles, 20'000); +} + } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConfig.h b/velox/connectors/hive/HiveConfig.h index 131ce587d36d..b792bdbec8eb 100644 --- a/velox/connectors/hive/HiveConfig.h +++ b/velox/connectors/hive/HiveConfig.h @@ -100,6 +100,9 @@ class HiveConfig { static constexpr const char* kMaxCoalescedDistanceBytes = "max-coalesced-distance-bytes"; + /// Maximum number of entries in the file handle cache. + static constexpr const char* kNumCacheFileHandles = "num_cached_file_handles"; + static InsertExistingPartitionsBehavior insertExistingPartitionsBehavior( const Config* config); @@ -136,6 +139,8 @@ class HiveConfig { static int64_t maxCoalescedBytes(const Config* config); static int32_t maxCoalescedDistanceBytes(const Config* config); + + static int32_t numCacheFileHandles(const Config* config); }; } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 4b245722ad85..1287ba2d3826 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -43,13 +43,12 @@ using namespace facebook::velox::exec; using namespace facebook::velox::dwrf; -DEFINE_int32( - num_file_handle_cache, - 20'000, - "Max number of file handles to cache."); - namespace facebook::velox::connector::hive { +int32_t numCachedFileHandles(const Config* properties) { + return properties ? HiveConfig::numCacheFileHandles(properties) : 20'000; +} + HiveConnector::HiveConnector( const std::string& id, std::shared_ptr properties, @@ -58,10 +57,13 @@ HiveConnector::HiveConnector( fileHandleFactory_( std::make_unique< SimpleLRUCache>>( - FLAGS_num_file_handle_cache), - std::make_unique(std::move(properties))), - executor_(executor) {} - + numCachedFileHandles(properties.get())), + std::make_unique(properties)), + executor_(executor) { + LOG(INFO) << "Hive connector " << connectorId() << " created with maximum of " + << numCachedFileHandles(properties.get()) + << " cached file handles."; +} std::unique_ptr HivePartitionFunctionSpec::create( int numPartitions) const { std::vector bucketToPartitions; diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h index 5970b9df80c1..795c0f9845ff 100644 --- a/velox/connectors/hive/HiveConnector.h +++ b/velox/connectors/hive/HiveConnector.h @@ -46,16 +46,17 @@ class HiveConnector : public Connector { HiveConfig::maxCoalescedBytes(connectorQueryCtx->config())); options.setMaxCoalesceDistance( HiveConfig::maxCoalescedDistanceBytes(connectorQueryCtx->config())); + options.setFileColumnNamesReadAsLowerCase( + HiveConfig::isFileColumnNamesReadAsLowerCase( + connectorQueryCtx->config())); return std::make_unique( outputType, tableHandle, columnHandles, &fileHandleFactory_, connectorQueryCtx->expressionEvaluator(), - connectorQueryCtx->allocator(), + connectorQueryCtx->cache(), connectorQueryCtx->scanId(), - HiveConfig::isFileColumnNamesReadAsLowerCase( - connectorQueryCtx->config()), executor_, options); } diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index 74b9f56f42d7..46de1360b58d 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -63,25 +63,39 @@ bool applyPartitionFilter( } } +struct SubfieldSpec { + const common::Subfield* subfield; + bool filterOnly; +}; + +template +void 
deduplicate(std::vector& values) { + std::sort(values.begin(), values.end()); + values.erase(std::unique(values.begin(), values.end()), values.end()); +} + // Recursively add subfields to scan spec. void addSubfields( const Type& type, - const std::vector& subfields, + std::vector& subfields, int level, memory::MemoryPool* pool, common::ScanSpec& spec) { - for (auto& subfield : subfields) { - if (level == subfield->path().size()) { + int newSize = 0; + for (int i = 0; i < subfields.size(); ++i) { + if (level < subfields[i].subfield->path().size()) { + subfields[newSize++] = subfields[i]; + } else if (!subfields[i].filterOnly) { spec.addAllChildFields(type); return; } } + subfields.resize(newSize); switch (type.kind()) { case TypeKind::ROW: { - folly::F14FastMap> - required; + folly::F14FastMap> required; for (auto& subfield : subfields) { - auto* element = subfield->path()[level].get(); + auto* element = subfield.subfield->path()[level].get(); auto* nestedField = dynamic_cast(element); VELOX_CHECK( @@ -114,11 +128,14 @@ void addSubfields( level + 1, pool, *spec.addMapValueField()); + if (subfields.empty()) { + return; + } bool stringKey = keyType->isVarchar() || keyType->isVarbinary(); std::vector stringSubscripts; std::vector longSubscripts; for (auto& subfield : subfields) { - auto* element = subfield->path()[level].get(); + auto* element = subfield.subfield->path()[level].get(); if (dynamic_cast(element)) { return; } @@ -142,8 +159,10 @@ void addSubfields( } std::unique_ptr filter; if (stringKey) { + deduplicate(stringSubscripts); filter = std::make_unique(stringSubscripts, false); } else { + deduplicate(longSubscripts); filter = common::createBigintValues(longSubscripts, false); } keys->setFilter(std::move(filter)); @@ -156,10 +175,13 @@ void addSubfields( level + 1, pool, *spec.addArrayElementField()); + if (subfields.empty()) { + return; + } constexpr long kMaxIndex = std::numeric_limits::max(); long maxIndex = -1; for (auto& subfield : subfields) { - auto* element = subfield->path()[level].get(); + auto* element = subfield.subfield->path()[level].get(); if (dynamic_cast(element)) { return; } @@ -175,7 +197,7 @@ void addSubfields( break; } default: - VELOX_FAIL("Subfields pruning not supported on type {}", type.toString()); + break; } } @@ -292,6 +314,9 @@ void checkColumnNameLowerCase(const SubfieldFilters& filters) { } void checkColumnNameLowerCase(const core::TypedExprPtr& typeExpr) { + if (typeExpr == nullptr) { + return; + } checkColumnNameLowerCase(typeExpr->type()); for (auto& type : typeExpr->inputs()) { checkColumnNameLowerCase(type); @@ -358,9 +383,8 @@ HiveDataSource::HiveDataSource( std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, core::ExpressionEvaluator* expressionEvaluator, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, const std::string& scanId, - bool fileColumnNamesReadAsLowerCase, folly::Executor* executor, const dwio::common::ReaderOptions& options) : fileHandleFactory_(fileHandleFactory), @@ -368,7 +392,7 @@ HiveDataSource::HiveDataSource( pool_(&options.getMemoryPool()), outputType_(outputType), expressionEvaluator_(expressionEvaluator), - allocator_(allocator), + cache_(cache), scanId_(scanId), executor_(executor) { // Column handled keyed on the column alias, the name used in the query. 
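The deduplicate() helper added above is the standard sort-then-unique idiom; a minimal standalone illustration (the values are hypothetical, not part of this change):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      // Duplicate subscripts arise from required subfields such as c0[10][1]
      // and c0[10][2], which both contribute the map key 10; deduplicate
      // before building the IN-list filter.
      std::vector<int64_t> subscripts{10, 1, 10};
      std::sort(subscripts.begin(), subscripts.end());
      subscripts.erase(
          std::unique(subscripts.begin(), subscripts.end()), subscripts.end());
      assert((subscripts == std::vector<int64_t>{1, 10}));
      return 0;
    }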
@@ -405,7 +429,7 @@ HiveDataSource::HiveDataSource( VELOX_CHECK( hiveTableHandle != nullptr, "TableHandle must be an instance of HiveTableHandle"); - if (fileColumnNamesReadAsLowerCase) { + if (readerOpts_.isFileColumnNamesReadAsLowerCase()) { checkColumnNameLowerCase(outputType); checkColumnNameLowerCase(hiveTableHandle->subfieldFilters()); checkColumnNameLowerCase(hiveTableHandle->remainingFilter()); @@ -796,22 +820,22 @@ std::shared_ptr HiveDataSource::makeScanSpec( spec->addFieldRecursively(name, *type, i); continue; } - std::vector subfieldPtrs; + std::vector subfieldSpecs; for (auto& subfield : subfields) { VELOX_CHECK_GT(subfield.path().size(), 0); auto* field = dynamic_cast( subfield.path()[0].get()); VELOX_CHECK(field); VELOX_CHECK_EQ(field->name(), name); - subfieldPtrs.push_back(&subfield); + subfieldSpecs.push_back({&subfield, false}); } if (auto it = requiredSubfieldsInFilters.find(name); it != requiredSubfieldsInFilters.end()) { for (auto* subfield : it->second) { - subfieldPtrs.push_back(subfield); + subfieldSpecs.push_back({subfield, true}); } } - addSubfields(*type, subfieldPtrs, 1, pool, *spec->addField(name, i)); + addSubfields(*type, subfieldSpecs, 1, pool, *spec->addField(name, i)); } for (auto& pair : filters) { @@ -835,12 +859,12 @@ std::unique_ptr HiveDataSource::createBufferedInput( const FileHandle& fileHandle, const dwio::common::ReaderOptions& readerOpts) { - if (auto* asyncCache = dynamic_cast(allocator_)) { + if (cache_) { return std::make_unique( fileHandle.file, dwio::common::MetricsLog::voidLog(), fileHandle.uuid.id(), - asyncCache, + cache_, Connector::getTracker(scanId_, readerOpts.loadQuantum()), fileHandle.groupId.id(), ioStats_, diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index c5db04e897c2..74e93195e68b 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -37,9 +37,8 @@ class HiveDataSource : public DataSource { std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, core::ExpressionEvaluator* expressionEvaluator, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, const std::string& scanId, - bool fileColumnNamesReadAsLowerCase, folly::Executor* executor, const dwio::common::ReaderOptions& options); @@ -162,7 +161,7 @@ class HiveDataSource : public DataSource { SelectivityVector filterRows_; exec::FilterEvalCtx filterEvalCtx_; - memory::MemoryAllocator* const allocator_; + cache::AsyncDataCache* const cache_{nullptr}; const std::string& scanId_; folly::Executor* executor_; }; diff --git a/velox/connectors/hive/tests/HiveConnectorTest.cpp b/velox/connectors/hive/tests/HiveConnectorTest.cpp index a8899dc465cc..fdbd12b994d2 100644 --- a/velox/connectors/hive/tests/HiveConnectorTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorTest.cpp @@ -184,11 +184,18 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) { } TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) { - auto columnType = ROW({{"c0c0", BIGINT()}, {"c0c1", VARCHAR()}}); + auto columnType = ROW( + {{"c0c0", BIGINT()}, + {"c0c1", VARCHAR()}, + {"c0c2", ROW({{"c0c2c0", BIGINT()}})}, + {"c0c3", ROW({{"c0c3c0", BIGINT()}})}}); auto rowType = ROW({{"c0", columnType}}); - auto columnHandle = makeColumnHandle("c0", columnType, {"c0.c0c1"}); + auto columnHandle = + makeColumnHandle("c0", columnType, {"c0.c0c1", "c0.c0c3"}); SubfieldFilters filters; filters.emplace(Subfield("c0.c0c0"), exec::equal(42)); + 
filters.emplace(Subfield("c0.c0c2"), exec::isNotNull()); + filters.emplace(Subfield("c0.c0c3"), exec::isNotNull()); auto scanSpec = HiveDataSource::makeScanSpec( filters, rowType, {columnHandle.get()}, {}, pool_.get()); auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0"); @@ -197,6 +204,34 @@ TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) { auto* c0c1 = scanSpec->childByName("c0")->childByName("c0c1"); ASSERT_FALSE(c0c1->isConstant()); ASSERT_FALSE(c0c1->filter()); + auto* c0c2 = scanSpec->childByName("c0")->childByName("c0c2"); + ASSERT_FALSE(c0c2->isConstant()); + ASSERT_TRUE(c0c2->filter()); + ASSERT_TRUE(c0c2->childByName("c0c2c0")->isConstant()); + auto* c0c3 = scanSpec->childByName("c0")->childByName("c0c3"); + ASSERT_FALSE(c0c3->isConstant()); + ASSERT_TRUE(c0c3->filter()); + ASSERT_FALSE(c0c3->childByName("c0c3c0")->isConstant()); +} + +TEST_F(HiveConnectorTest, makeScanSpec_duplicateSubfields) { + auto c0Type = MAP(BIGINT(), MAP(BIGINT(), BIGINT())); + auto c1Type = MAP(VARCHAR(), MAP(BIGINT(), BIGINT())); + auto rowType = ROW({{"c0", c0Type}, {"c1", c1Type}}); + std::shared_ptr columnHandles[] = { + makeColumnHandle("c0", c0Type, {"c0[10][1]", "c0[10][2]"}), + makeColumnHandle("c1", c1Type, {"c1[\"foo\"][1]", "c1[\"foo\"][2]"}), + }; + auto scanSpec = HiveDataSource::makeScanSpec( + {}, + rowType, + {columnHandles[0].get(), columnHandles[1].get()}, + {}, + pool_.get()); + auto* c0 = scanSpec->childByName("c0"); + ASSERT_EQ(c0->children().size(), 2); + auto* c1 = scanSpec->childByName("c1"); + ASSERT_EQ(c1->children().size(), 2); } TEST_F(HiveConnectorTest, extractFiltersFromRemainingFilter) { diff --git a/velox/core/CMakeLists.txt b/velox/core/CMakeLists.txt index 86d171485ed9..f4db34c8f382 100644 --- a/velox/core/CMakeLists.txt +++ b/velox/core/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(velox_core Expressions.cpp PlanFragment.cpp PlanNode.cpp target_link_libraries( velox_core + velox_caching velox_config velox_expression_functions velox_type diff --git a/velox/core/PlanNode.cpp b/velox/core/PlanNode.cpp index b9c336c43fab..41ab55952cfb 100644 --- a/velox/core/PlanNode.cpp +++ b/velox/core/PlanNode.cpp @@ -1346,19 +1346,29 @@ RowTypePtr getRowNumberOutputType( return ROW(std::move(names), std::move(types)); } + +RowTypePtr getOptionalRowNumberOutputType( + const RowTypePtr& inputType, + const std::optional& rowNumberColumnName) { + if (rowNumberColumnName) { + return getRowNumberOutputType(inputType, rowNumberColumnName.value()); + } + + return inputType; +} } // namespace RowNumberNode::RowNumberNode( PlanNodeId id, std::vector partitionKeys, - const std::string& rowNumberColumnName, + const std::optional& rowNumberColumnName, std::optional limit, PlanNodePtr source) : PlanNode(std::move(id)), partitionKeys_{std::move(partitionKeys)}, limit_{limit}, sources_{std::move(source)}, - outputType_(getRowNumberOutputType( + outputType_(getOptionalRowNumberOutputType( sources_[0]->outputType(), rowNumberColumnName)) {} @@ -1380,7 +1390,9 @@ void RowNumberNode::addDetails(std::stringstream& stream) const { folly::dynamic RowNumberNode::serialize() const { auto obj = PlanNode::serialize(); obj["partitionKeys"] = ISerializable::serialize(partitionKeys_); - obj["rowNumberColumnName"] = outputType_->names().back(); + if (generateRowNumber()) { + obj["rowNumberColumnName"] = outputType_->names().back(); + } if (limit_) { obj["limit"] = limit_.value(); } @@ -1398,26 +1410,19 @@ PlanNodePtr RowNumberNode::create(const folly::dynamic& obj, void* context) { 
limit = obj["limit"].asInt(); } + std::optional rowNumberColumnName; + if (obj.count("rowNumberColumnName")) { + rowNumberColumnName = obj["rowNumberColumnName"].asString(); + } + return std::make_shared( deserializePlanNodeId(obj), partitionKeys, - obj["rowNumberColumnName"].asString(), + rowNumberColumnName, limit, source); } -namespace { -RowTypePtr getTopNRowNumberOutputType( - const RowTypePtr& inputType, - const std::optional& rowNumberColumnName) { - if (rowNumberColumnName) { - return getRowNumberOutputType(inputType, rowNumberColumnName.value()); - } - - return inputType; -} -} // namespace - TopNRowNumberNode::TopNRowNumberNode( PlanNodeId id, std::vector partitionKeys, @@ -1432,7 +1437,7 @@ TopNRowNumberNode::TopNRowNumberNode( sortingOrders_{std::move(sortingOrders)}, limit_{limit}, sources_{std::move(source)}, - outputType_{getTopNRowNumberOutputType( + outputType_{getOptionalRowNumberOutputType( sources_[0]->outputType(), rowNumberColumnName)} { VELOX_USER_CHECK_EQ( @@ -1755,9 +1760,9 @@ PlanNodePtr PartitionedOutputNode::create( void* context) { return std::make_shared( deserializePlanNodeId(obj), + stringToKind(obj["kind"].asString()), ISerializable::deserialize>(obj["keys"], context), obj["numPartitions"].asInt(), - stringToKind(obj["kind"].asString()), obj["replicateNullsAndAny"].asBool(), ISerializable::deserialize( obj["partitionFunctionSpec"], context), diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 6e8d4dfdc5c4..4d3fedf2c469 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -1097,14 +1097,13 @@ class PartitionedOutputNode : public PlanNode { PlanNodePtr source) : PartitionedOutputNode( id, + broadcast ? Kind::kBroadcast : Kind::kPartitioned, keys, numPartitions, - broadcast ? Kind::kBroadcast : Kind::kPartitioned, replicateNullsAndAny, partitionFunctionSpec, outputType, source) {} -#endif PartitionedOutputNode( const PlanNodeId& id, @@ -1115,11 +1114,31 @@ class PartitionedOutputNode : public PlanNode { PartitionFunctionSpecPtr partitionFunctionSpec, RowTypePtr outputType, PlanNodePtr source) + : PartitionedOutputNode( + id, + kind, + keys, + numPartitions, + replicateNullsAndAny, + std::move(partitionFunctionSpec), + std::move(outputType), + std::move(source)) {} +#endif + + PartitionedOutputNode( + const PlanNodeId& id, + Kind kind, + const std::vector& keys, + int numPartitions, + bool replicateNullsAndAny, + PartitionFunctionSpecPtr partitionFunctionSpec, + RowTypePtr outputType, + PlanNodePtr source) : PlanNode(id), + kind_(kind), sources_{{std::move(source)}}, keys_(keys), numPartitions_(numPartitions), - kind_(kind), replicateNullsAndAny_(replicateNullsAndAny), partitionFunctionSpec_(std::move(partitionFunctionSpec)), outputType_(std::move(outputType)) { @@ -1129,10 +1148,11 @@ class PartitionedOutputNode : public PlanNode { keys_.empty(), "Non-empty partitioning keys require more than one partition"); } - if (isBroadcast()) { + if (!isPartitioned()) { VELOX_CHECK( keys_.empty(), - "Broadcast partitioning doesn't allow for partitioning keys"); + "{} partitioning doesn't allow for partitioning keys", + kindString(kind_)); } } @@ -1144,9 +1164,9 @@ class PartitionedOutputNode : public PlanNode { std::vector noKeys; return std::make_shared( id, + Kind::kBroadcast, noKeys, numPartitions, - Kind::kBroadcast, false, std::make_shared(), std::move(outputType), @@ -1154,13 +1174,27 @@ class PartitionedOutputNode : public PlanNode { } static std::shared_ptr - single(const PlanNodeId& id, RowTypePtr outputType, PlanNodePtr source) { + 
arbitrary(const PlanNodeId& id, RowTypePtr outputType, PlanNodePtr source) { std::vector noKeys; return std::make_shared( id, + Kind::kArbitrary, noKeys, 1, + false, + std::make_shared(), + std::move(outputType), + std::move(source)); + } + + static std::shared_ptr + single(const PlanNodeId& id, RowTypePtr outputType, PlanNodePtr source) { + std::vector noKeys; + return std::make_shared( + id, Kind::kPartitioned, + noKeys, + 1, false, std::make_shared(), std::move(outputType), @@ -1187,10 +1221,18 @@ class PartitionedOutputNode : public PlanNode { return numPartitions_; } + bool isPartitioned() const { + return kind_ == Kind::kPartitioned; + } + bool isBroadcast() const { return kind_ == Kind::kBroadcast; } + bool isArbitrary() const { + return kind_ == Kind::kArbitrary; + } + Kind kind() const { return kind_; } @@ -1222,10 +1264,10 @@ class PartitionedOutputNode : public PlanNode { private: void addDetails(std::stringstream& stream) const override; + const Kind kind_; const std::vector sources_; const std::vector keys_; const int numPartitions_; - const Kind kind_; const bool replicateNullsAndAny_; const PartitionFunctionSpecPtr partitionFunctionSpec_; const RowTypePtr outputType_; @@ -2066,19 +2108,21 @@ class WindowNode : public PlanNode { /// Optimized version of a WindowNode for a single row_number function with an /// optional limit and no sorting. -/// The output of this node contains all input columns followed by a +/// The output of this node contains all input columns followed by an optional /// 'rowNumberColumnName' BIGINT column. class RowNumberNode : public PlanNode { public: /// @param partitionKeys Partitioning keys. May be empty. - /// @param rowNumberColumnName Name of the column containing row numbers. + /// @param rowNumberColumnName Optional name of the column containing row + /// numbers. If not specified, the output doesn't include 'row number' column. + /// This is used when computing partial results. /// @param limit Optional per-partition limit. If specified, the number of /// rows produced by this node will not exceed this value for any given /// partition. Extra rows will be dropped. 
RowNumberNode( PlanNodeId id, std::vector partitionKeys, - const std::string& rowNumberColumnName, + const std::optional& rowNumberColumnName, std::optional limit, PlanNodePtr source); @@ -2098,6 +2142,10 @@ class RowNumberNode : public PlanNode { return limit_; } + bool generateRowNumber() const { + return outputType_->size() > sources_[0]->outputType()->size(); + } + std::string_view name() const override { return "RowNumber"; } diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index cb06600b2fb5..2bfa52f6d9f1 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -21,13 +21,13 @@ QueryCtx::QueryCtx( folly::Executor* executor, std::unordered_map queryConfigValues, std::unordered_map> connectorConfigs, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, std::shared_ptr pool, std::shared_ptr spillExecutor, const std::string& queryId) : queryId_(queryId), connectorConfigs_(connectorConfigs), - allocator_(allocator), + cache_(cache), pool_(std::move(pool)), executor_(executor), queryConfig_{std::move(queryConfigValues)}, @@ -39,12 +39,12 @@ QueryCtx::QueryCtx( folly::Executor::KeepAlive<> executorKeepalive, std::unordered_map queryConfigValues, std::unordered_map> connectorConfigs, - memory::MemoryAllocator* allocator, + cache::AsyncDataCache* cache, std::shared_ptr pool, const std::string& queryId) : queryId_(queryId), connectorConfigs_(connectorConfigs), - allocator_(allocator), + cache_(cache), pool_(std::move(pool)), executorKeepalive_(std::move(executorKeepalive)), queryConfig_{std::move(queryConfigValues)} { diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index d4f50874a998..7b0f178b5c75 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -17,8 +17,8 @@ #include #include +#include "velox/common/caching/AsyncDataCache.h" #include "velox/common/memory/Memory.h" -#include "velox/common/memory/MemoryAllocator.h" #include "velox/core/QueryConfig.h" #include "velox/vector/DecodedVector.h" #include "velox/vector/VectorPool.h" @@ -39,8 +39,7 @@ class QueryCtx { std::unordered_map queryConfigValues = {}, std::unordered_map> connectorConfigs = {}, - memory::MemoryAllocator* allocator = - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), std::shared_ptr pool = nullptr, std::shared_ptr spillExecutor = nullptr, const std::string& queryId = ""); @@ -54,8 +53,7 @@ class QueryCtx { std::unordered_map queryConfigValues = {}, std::unordered_map> connectorConfigs = {}, - memory::MemoryAllocator* allocator = - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), std::shared_ptr pool = nullptr, const std::string& queryId = ""); @@ -65,8 +63,8 @@ class QueryCtx { return pool_.get(); } - memory::MemoryAllocator* allocator() const { - return allocator_; + cache::AsyncDataCache* cache() const { + return cache_; } folly::Executor* executor() const { @@ -135,7 +133,7 @@ class QueryCtx { const std::string queryId_; std::unordered_map> connectorConfigs_; - memory::MemoryAllocator* allocator_; + cache::AsyncDataCache* cache_; std::shared_ptr pool_; folly::Executor* executor_; folly::Executor::KeepAlive<> executorKeepalive_; diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index bbb595e0d185..0d1fa0c93d32 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -286,7 +286,7 @@ Hive Connector - bool - false - True if reading the source file column names as lower case, and planner should guarantee - - the 
input column name and filter is also lower case to achive case-insensitive read..
+      the input column name and filter is also lower case to achieve case-insensitive read.
   * - max-coalesced-bytes
     - integer
     - 512KB
@@ -327,7 +327,7 @@ Hive Connector
     - bool
     - false
     - Use path-style access for all requests to the S3-compatible storage. This is for S3-compatible storage that
-      doesn’t support virtual-hosted-style access.
+      doesn't support virtual-hosted-style access.
   * - hive.s3.ssl.enabled
     - bool
     - true
diff --git a/velox/docs/develop.rst b/velox/docs/develop.rst
index 4f132d68a914..35bac5e81730 100644
--- a/velox/docs/develop.rst
+++ b/velox/docs/develop.rst
@@ -22,7 +22,7 @@ This guide is intended for Velox contributors and developers of Velox-based appl
    develop/task
    develop/simd
    develop/spilling
-   develop/unsaferow
+   develop/serde
    develop/testing
    develop/debugging
    develop/TpchBenchmark
diff --git a/velox/docs/develop/aggregate-functions.rst b/velox/docs/develop/aggregate-functions.rst
index 234d230704c8..2ec4bc4be95e 100644
--- a/velox/docs/develop/aggregate-functions.rst
+++ b/velox/docs/develop/aggregate-functions.rst
@@ -583,27 +583,293 @@ You can see the documentation for all functions at :doc:`../functions/presto/agg
 Accumulator
 -----------
 
-Variable-width accumulators need to use :doc:`HashStringAllocator ` to allocate memory. An instance of the allocator is available in the base class: *velox::exec::Aggregate::allocator_*.
+In Velox, efficient use of memory is a priority. This includes both optimizing
+the total amount of memory used as well as the number of memory allocations.
+Note that runtime statistics reported by Velox include both peak memory usage
+(in bytes) and number of memory allocations for each operator.
 
-Sometimes you’ll need to create a custom accumulator. Sometimes one of the existing accumulators would do the jobs.
+Aggregate functions use memory to store intermediate results in the
+accumulators. They allocate memory from an arena (:doc:`HashStringAllocator ` class).
 
-SingleValueAccumulator used by :func:`min`, :func:`max` and :func:`arbitrary` functions can be used to store a single value of variable-width type, e.g. string, array, map or struct.
+array_agg and ValueList
+~~~~~~~~~~~~~~~~~~~~~~~
 
-ValueList accumulator used by :func:`array_agg` and :func:`map_agg` accumulates a list of values. This is an append-only accumulator.
+StlAllocator is an STL-compatible allocator backed by HashStringAllocator that
+can be used with STL containers. For example, one can define an std::vector
+that allocates memory from the arena like so:
 
-An StlAllocator defined in velox/exec/HashStringAllocator.h can be used to make STL containers (e.g. std::vector) backed by memory allocated via the HashStringAllocator. StlAllocator is not an accumulator itself, but can be used to design accumulators that use STL containers. It is used by :func:`approx_percentile` and :func:`approx_distinct`.
+.. code-block:: c++
+
+    std::vector<int64_t, StlAllocator<int64_t>>
+
+This is used, for example, in 3-arg versions of :func:`min_by` and :func:`max_by` with
+fixed-width type inputs (e.g. integers).
+
+There is also an AlignedStlAllocator that provides aligned allocations from the
+arena and can be used with `F14 `_
+containers which require 16-byte alignment. One can define an F14FastMap that
+allocates memory from the arena like so:
 
-Memory allocated from the HashStringAllocator needs to be released in the destroy() method. See velox/aggregates/ArrayAgg.cpp for an example.
-
-.. code-block:: c++
-
-   void destroy(folly::Range<char**> groups) override {
-     for (auto group : groups) {
-       if (auto header = value<ArrayAccumulator>(group)->elements.begin()) {
-         allocator_->free(header);
-       }
-     }
-   }
+.. code-block:: c++
+
+    folly::F14FastMap<
+        int64_t,
+        double,
+        std::hash<int64_t>,
+        std::equal_to<int64_t>,
+        AlignedStlAllocator<std::pair<const int64_t, double>, 16>>
+
+You can find an example usage in the :func:`histogram` aggregation function.
+
+An :func:`array_agg` function on primitive types could be implemented using
+std::vector, but it would not be efficient. Why is that? If one doesn’t use
+the ‘reserve’ method to provide a hint to std::vector about how many entries
+will be added, the default behavior is to allocate memory in powers of 2, e.g.
+first allocate 1 entry, then 2, then 4, 8, 16, etc. Every time a new
+allocation is made, the data is copied into the new memory buffer and the old
+buffer is released. One can see this by instrumenting the
+StlAllocator::allocate and deallocate methods to add logging and running a
+simple loop that adds elements to a vector:
+
+.. code-block:: c++
+
+    std::vector<int64_t, StlAllocator<int64_t>> data(
+        0, StlAllocator<int64_t>(allocator_.get()));
+
+    for (auto i = 0; i < 100; ++i) {
+      data.push_back(i);
+    }
+
+.. code-block:: text
+
+    E20230714 14:57:33.717708 975289 HashStringAllocator.h:497] allocate 1
+    E20230714 14:57:33.734280 975289 HashStringAllocator.h:497] allocate 2
+    E20230714 14:57:33.734321 975289 HashStringAllocator.h:506] free 1
+    E20230714 14:57:33.734352 975289 HashStringAllocator.h:497] allocate 4
+    E20230714 14:57:33.734381 975289 HashStringAllocator.h:506] free 2
+    E20230714 14:57:33.734416 975289 HashStringAllocator.h:497] allocate 8
+    E20230714 14:57:33.734445 975289 HashStringAllocator.h:506] free 4
+    E20230714 14:57:33.734481 975289 HashStringAllocator.h:497] allocate 16
+    E20230714 14:57:33.734513 975289 HashStringAllocator.h:506] free 8
+    E20230714 14:57:33.734544 975289 HashStringAllocator.h:497] allocate 32
+    E20230714 14:57:33.734575 975289 HashStringAllocator.h:506] free 16
+    E20230714 14:57:33.734606 975289 HashStringAllocator.h:497] allocate 64
+    E20230714 14:57:33.734637 975289 HashStringAllocator.h:506] free 32
+    E20230714 14:57:33.734668 975289 HashStringAllocator.h:497] allocate 128
+    E20230714 14:57:33.734699 975289 HashStringAllocator.h:506] free 64
+    E20230714 14:57:33.734731 975289 HashStringAllocator.h:506] free 128
+
+Reallocating memory and copying data is not cheap. To avoid this overhead we
+introduced the ValueList primitive and used it to implement array_agg.
+
+ValueList is an append-only data structure that allows appending values from any
+Velox Vector and reading values back into a Velox Vector. ValueList doesn’t
+require a contiguous chunk of memory and therefore doesn’t need to re-allocate
+and copy when it runs out of space. It just allocates another chunk and starts
+filling that up.
+
+ValueList is designed to work with data that comes from Velox Vectors, hence,
+its API is different from std::vector. You append values from a DecodedVector
+and read values back into a flat vector. Here is an example of usage:
+
+.. code-block:: c++
+
+    DecodedVector decoded(*data);
+
+    // Store data.
+    ValueList values;
+    for (auto i = 0; i < 100; ++i) {
+      values.appendValue(decoded, i, allocator());
+    }
+
+    // Read data back.
+    auto copy = BaseVector::create(DOUBLE(), 100, pool());
+    aggregate::ValueListReader reader(values);
+    for (auto i = 0; i < 100; ++i) {
+      reader.next(*copy, i);
+    }
+
+ValueList supports all types, so you can use it to append fixed-width values as
+well as strings, arrays, maps and structs.
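+
+As an aside on the std::vector log above: when the number of entries is known
+up front, std::vector's reserve makes a single arena allocation and avoids the
+grow-copy-free sequence entirely. A minimal sketch, assuming the same
+StlAllocator setup as before:
+
+.. code-block:: c++
+
+    std::vector<int64_t, StlAllocator<int64_t>> data(
+        0, StlAllocator<int64_t>(allocator_.get()));
+
+    // One allocation sized for all 100 entries; no intermediate copies.
+    data.reserve(100);
+    for (auto i = 0; i < 100; ++i) {
+      data.push_back(i);
+    }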
+
+When storing complex types, ValueList serializes the values using
+ContainerRowSerde.
+
+ValueList preserves the null flags as well, so you can store a list of nullable
+values in it.
+
+The array_agg function is implemented using ValueList for the accumulator.
+
+ValueList needs a pointer to the arena for appending data. It doesn’t take an
+arena in the constructor and doesn’t store it, because that would require 8
+bytes of memory per group in the aggregation operator. Instead, the
+ValueList::appendValue method takes a pointer to the arena as an argument.
+Consequently, ValueList’s destructor cannot release the memory back to the
+arena and requires the user to explicitly call the
+free(HashStringAllocator*) method.
+
+min, max, and SingleValueAccumulator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :func:`min` and :func:`max` functions store a single value in the
+accumulator (the current min or max value). They use SingleValueAccumulator to
+store strings, arrays, maps and structs. When processing a new value, we
+compare it with the stored value and replace the stored value if necessary.
+
+Similar to ValueList, SingleValueAccumulator serializes the values using
+ContainerRowSerde. SingleValueAccumulator provides a compare method to compare
+the stored value with a row of a DecodedVector.
+
+This accumulator is also used in the implementation of the :func:`arbitrary`
+aggregate function, which stores the first value in the accumulator.
+
+set_agg, set_union, Strings and AddressableNonNullValueList
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :func:`set_agg` function accumulates a set of unique values into an
+F14FastSet configured to allocate memory from the arena via
+AlignedStlAllocator. Fixed-width values are stored directly in the F14FastSet.
+The memory allocation pattern for F14 data structures is similar to
+std::vector: F14 allocates memory in powers of 2, copies data and frees
+previously allocated memory. Hence, we do not store strings directly in the
+F14 set. Instead, Velox writes strings into the arena and stores a StringView
+pointing to the arena in the set.
+
+In general, when writing to the arena, one is not guaranteed a contiguous write.
+However, for StringViews to work we must ensure that strings written into the
+arena are contiguous. The Strings helper class provides this functionality. Its
+append method takes a StringView and a pointer to the arena, copies the string
+into the arena and returns a StringView pointing to the copy.
+
+.. code-block:: c++
+
+    /// Copies the string into contiguous memory allocated via
+    /// HashStringAllocator. Returns StringView over the copy.
+    StringView append(StringView value, HashStringAllocator& allocator);
+
+The Strings class provides a free method to release memory back to the arena.
+
+.. code-block:: c++
+
+    /// Frees memory used by the strings. StringViews returned from 'append'
+    /// become invalid after this call.
+    void free(HashStringAllocator& allocator);
+
+When aggregating complex types (arrays, maps or structs), we use
+AddressableNonNullValueList, which writes values to the arena and returns
+a “pointer” to the written value, which we store in the F14 set.
+AddressableNonNullValueList provides methods to compute a hash of a value and
+compare two values. AddressableNonNullValueList uses ContainerRowSerde for
+serializing data and comparing serialized values.
+
+.. code-block:: c++
+
+    /// A set of pointers to values stored in AddressableNonNullValueList.
+    SetAccumulator<
+        HashStringAllocator::Position,
+        AddressableNonNullValueList::Hash,
+        AddressableNonNullValueList::EqualTo>
+        base;
+
+AddressableNonNullValueList allows appending a value and erasing the last
+value. This functionality is sufficient for set_agg and set_union. When
+processing a new value, we append it to the list, get a “pointer”, insert that
+“pointer” into the F14 set, and if the “pointer” points to a duplicate value we
+remove it from the list.
+
+Like all other arena-based accumulators, AddressableNonNullValueList provides a
+free method to return memory back to the arena.
+
+Note: AddressableNonNullValueList is different from ValueList in that it
+provides access to individual values (hence, the “Addressable” prefix in the
+name) while ValueList does not. With ValueList one can append values, then copy
+all the values into a Vector. Ad hoc access to individual elements is not
+available in ValueList.
+
+The SetAccumulator<T> template implements a simple interface to accumulate
+unique values. It is implemented using F14FastSet, Strings and
+AddressableNonNullValueList. T can be a fixed-width type like int32_t or
+int64_t, StringView or ComplexType.
+
+The addValue and addValues methods allow adding one or more values from a
+vector.
+
+.. code-block:: c++
+
+    /// Adds value if new. No-op if the value was added before.
+    void addValue(
+        const DecodedVector& decoded,
+        vector_size_t index,
+        HashStringAllocator* allocator)
+
+    /// Adds new values from an array.
+    void addValues(
+        const ArrayVector& arrayVector,
+        vector_size_t index,
+        const DecodedVector& values,
+        HashStringAllocator* allocator)
+
+The size() method returns the number of unique values.
+
+.. code-block:: c++
+
+    /// Returns number of unique values including null.
+    size_t size() const
+
+The extractValues method allows extracting the unique values into a vector.
+
+.. code-block:: c++
+
+    /// Copies the unique values and null into the specified vector starting at
+    /// the specified offset.
+    vector_size_t extractValues(FlatVector<T>& values, vector_size_t offset)
+
+    /// For complex types.
+    vector_size_t extractValues(BaseVector& values, vector_size_t offset)
+
+Both :func:`set_agg` and :func:`set_union` functions are implemented using
+SetAccumulator.
+
+map_agg, map_union, and MapAccumulator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :func:`map_agg` function accumulates keys and values into a map. It
+discards duplicate keys and keeps only one value for each unique key. map_agg
+uses the MapAccumulator template to accumulate the values. Similar to
+SetAccumulator, MapAccumulator is built using F14FastMap, AlignedStlAllocator,
+Strings and AddressableNonNullValueList.
+
+The insert() method adds a pair of (key, value) to the map, discarding the
+value if a matching key already exists.
+
+.. code-block:: c++
+
+    /// Adds key-value pair if entry with that key doesn't exist yet.
+    void insert(
+        const DecodedVector& decodedKeys,
+        const DecodedVector& decodedValues,
+        vector_size_t index,
+        HashStringAllocator& allocator)
+
+The size() method returns the number of unique keys.
+
+The extract() method copies the keys and the values into vectors, which can be
+combined to form a MapVector.
+
+.. code-block:: c++
+
+    void extract(
+        const VectorPtr& mapKeys,
+        const VectorPtr& mapValues,
+        vector_size_t offset)
+
+Both :func:`map_agg` and :func:`map_union` functions are implemented using
+MapAccumulator.
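+
+All of these arena-backed accumulators share the same lifecycle requirement:
+memory must be handed back to the arena explicitly in the aggregate function's
+destroy() hook. A minimal sketch for a hypothetical ValueList-based
+accumulator (the struct name is an assumption, modeled on array_agg):
+
+.. code-block:: c++
+
+    struct ArrayAccumulator {
+      ValueList elements;
+    };
+
+    void destroy(folly::Range<char**> groups) override {
+      for (auto group : groups) {
+        // ValueList's destructor cannot reach the arena; release the memory
+        // explicitly via the HashStringAllocator.
+        value<ArrayAccumulator>(group)->elements.free(allocator_);
+      }
+    }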
+ +When implementing new aggregate functions, consider using ValueList, +SingleValueAccumulator, Strings, AddressableNonNullValueList and F14 +containers to put together an accumulator that uses memory efficiently. End-to-End Testing ------------------ diff --git a/velox/docs/develop/operators.rst b/velox/docs/develop/operators.rst index 3dc7ab1649a7..18cbadab9152 100644 --- a/velox/docs/develop/operators.rst +++ b/velox/docs/develop/operators.rst @@ -89,7 +89,7 @@ with HiveConnector, table scan reads data from ORC or Parquet files. ArrowStreamNode ~~~~~~~~~~~~~~~ -The Arrow stream operation reads data from an Arrow array stream. The ArrowArrayStream structure is defined in Arrow abi, +The Arrow stream operation reads data from an Arrow array stream. The ArrowArrayStream structure is defined in Arrow abi, and provides the required callbacks to interact with a streaming source of Arrow arrays. .. list-table:: @@ -583,8 +583,8 @@ each batch of input it computes and returns the results before accepting the next batch of input. This operator accumulates state: a hash table mapping partition keys to total -number of rows seen in this partition so far. This operator doesn't support -spilling yet. +number of rows seen in this partition so far. Returning the row numbers as +a column in the output is optional. This operator doesn't support spilling yet. This operator is equivalent to a WindowNode followed by FilterNode(row_number <= limit), but it uses less memory and CPU and makes @@ -600,7 +600,7 @@ results available before seeing all input. * - partitionKeys - Partition by columns. * - rowNumberColumnName - - Output column name for the row numbers. + - Optional output column name for the row numbers. If specified, the generated row numbers are returned as an output column appearing after all input columns. * - limit - Optional per-partition limit. If specified, the number of rows produced by this node will not exceed this value for any given partition. Extra rows will be dropped. @@ -615,8 +615,8 @@ a 'limit' number of top rows for each partition. After receiving all input, assigns row numbers within each partition starting from 1. This operator accumulates state: a hash table mapping partition keys to a list -of top 'limit' rows within that partition. This operator doesn't support -spilling yet. +of top 'limit' rows within that partition. Returning the row numbers as +a column in the output is optional. This operator doesn't support spilling yet. This operator is logically equivalent to a WindowNode followed by FilterNode(row_number <= limit), but it uses less memory and CPU. @@ -635,7 +635,7 @@ FilterNode(row_number <= limit), but it uses less memory and CPU. * - sortingOrders - Sorting order for each sorting key above. The supported sort orders are asc nulls first, asc nulls last, desc nulls first and desc nulls last. * - rowNumberColumnName - - Output column name for the row numbers. + - Optional output column name for the row numbers. If specified, the generated row numbers are returned as an output column appearing after all input columns. * - limit - Per-partition limit. If specified, the number of rows produced by this node will not exceed this value for any given partition. Extra rows will be dropped. 
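+
+A minimal sketch of constructing the partial-style variant described above (no
+row-number output column), assuming the updated RowNumberNode constructor from
+this change, with partitionKeys and source as placeholders in scope:
+
+.. code-block:: c++
+
+    // Row numbers are still computed to enforce the per-partition limit, but
+    // they are not returned because rowNumberColumnName is std::nullopt.
+    auto rowNumber = std::make_shared<core::RowNumberNode>(
+        "rowNumber",                  // plan node id
+        partitionKeys,                // partition-by keys, may be empty
+        std::nullopt,                 // no row-number output column
+        std::optional<int32_t>(100),  // keep at most 100 rows per partition
+        source);                      // upstream plan node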
diff --git a/velox/docs/develop/serde.rst b/velox/docs/develop/serde.rst
new file mode 100644
index 000000000000..8b367c5baaf8
--- /dev/null
+++ b/velox/docs/develop/serde.rst
@@ -0,0 +1,29 @@
+*********************
+Serialization Formats
+*********************
+
+Velox supports three data serialization formats that can be used for data shuffle:
+`PrestoPage `_,
+UnsafeRow and CompactRow. PrestoPage is a columnar format. UnsafeRow and CompactRow
+are row-wise formats.
+
+Velox applications can register their own formats as well.
+
+PrestoPage format is described in the `Presto documentation `_.
+
+UnsafeRow format comes from `Apache Spark `_.
+
+CompactRow is similar to UnsafeRow, but it is more space efficient and results in
+fewer bytes shuffled, which has a cascading effect on CPU usage (for compression
+and checksumming) and memory (for buffering).
+
+The details of the UnsafeRow and CompactRow formats can be found in the following articles.
+
+.. toctree::
+  :maxdepth: 1
+
+  serde/unsaferow
+  serde/compactrow
+
+Velox also uses another row-wise serialization format, ContainerRowSerde, for storing
+data in aggregation and join operators. This format is similar to CompactRow.
diff --git a/velox/docs/develop/serde/compactrow.rst b/velox/docs/develop/serde/compactrow.rst
new file mode 100644
index 000000000000..bb40c909b87e
--- /dev/null
+++ b/velox/docs/develop/serde/compactrow.rst
@@ -0,0 +1,118 @@
+==========
+CompactRow
+==========
+
+CompactRow is a row-wise serialization format provided by Velox as an
+alternative to the UnsafeRow format. CompactRow is more space efficient than
+UnsafeRow and results in fewer bytes shuffled, which has a cascading effect on
+CPU usage (for compression and checksumming) and memory (for buffering).
+
+A row is a contiguous buffer that starts with null flags, followed by individual
+fields.
+
+nulls | field1 | field2 | …
+
+The nulls section uses one bit per field to indicate which fields are null. If
+there are 10 fields, there will be 2 bytes of null flags (16 bits total, 10 bits
+used, 6 bits unused).
+
+Fixed-width fields (integers, booleans, floating point numbers) take up a fixed
+number of bytes regardless of whether they are null or not. A row with 10
+bigint fields takes up 2 + 10 * 8 = 82 bytes: 2 bytes for null flags plus 8
+bytes per field.
+
+The sizes of fixed-width fields are:
+
+================ ==============================================
+Type             Number of bytes used for serialization
+================ ==============================================
+BOOLEAN          1
+TINYINT          1
+SMALLINT         2
+INTEGER          4
+BIGINT           8
+HUGEINT          16
+REAL             4
+DOUBLE           8
+TIMESTAMP        8
+UNKNOWN          0
+================ ==============================================
+
+Strings (VARCHAR and VARBINARY) use 4 bytes for the size plus the length of the
+string. An empty string uses 4 bytes. A 1-character string uses 5 bytes. A
+20-character ASCII string uses 24 bytes. Null strings do not take up space
+(other than one bit in the nulls section).
+
+Arrays of fixed-width values or strings, e.g. arrays of integers, use 4 bytes
+for the size of the array, a few bytes for null flags indicating the null-ness
+of the elements (1 bit per element), plus the space taken by the elements
+themselves.
+
+For example, an array of 5 integers [1, 2, 3, 4, 5] uses 4 bytes for the size, 1
+byte for 5 null flags and 5 * 4 bytes for 5 values. A total of 25 bytes.
+
+
+============ ==== ======== ====== ====== ====== ====== ======
+Description  Size Nulls    Elem 1 Elem 2 Elem 3 Elem 4 Elem 5
+============ ==== ======== ====== ====== ====== ====== ======
+# of bytes   4    1        4      4      4      4      4
+Value        5    00000000 1      2      3      4      5
+============ ==== ======== ====== ====== ====== ====== ======
+
+An array of 4 strings [null, “Abc”, null, “Mountains and rivers”] uses 36 bytes:
+
+============ ==== ======== ======= ====== ======= =====================
+Description  Size Nulls    Size s2 s2     Size s4 s4
+============ ==== ======== ======= ====== ======= =====================
+# of bytes   4    1        4       3      4       20
+Value        4    10100000 3       Abc    20      Mountains and rivers
+============ ==== ======== ======= ====== ======= =====================
+
+Serialization of an array of complex type elements, e.g. an array of arrays, maps or structs, includes a few additional fields: the total serialized size plus the offset of each element in the serialized buffer.
+
+- 4 bytes - array size.
+- N bytes - null flags, 1 bit per element.
+- 4 bytes - Total serialized size of the array excluding the first 2 fields (size and nulls).
+- 4 bytes per element - Offsets of the elements in the serialized buffer relative to the position right after the total serialized size.
+- Elements.
+
+For example, an array of arrays [[1, 2, 3], [4, 5], [6]] uses 60 bytes:
+
+- 4 bytes - size - 3
+- 1 byte - nulls - 00000000
+- 4 bytes - total serialized size - 55
+- 4 bytes - offset of the 1st element - 12
+- 4 bytes - offset of the 2nd element - 29
+- 4 bytes - offset of the 3rd element - 42
+- --- Start of the 1st element: [1, 2, 3]
+- 4 bytes - size - 3
+- 1 byte - nulls - 00000000
+- 4 bytes - element 1 - 1
+- 4 bytes - element 2 - 2
+- 4 bytes - element 3 - 3
+- --- Start of the 2nd element: [4, 5]
+- 4 bytes - size - 2
+- 1 byte - nulls - 00000000
+- 4 bytes - element 1 - 4
+- 4 bytes - element 2 - 5
+- --- Start of the 3rd element: [6]
+- 4 bytes - size - 1
+- 1 byte - nulls - 00000000
+- 4 bytes - element 1 - 6
+
+A map is serialized as the keys array followed by the values array.
+
+A struct is serialized the same way as the top-level row.
+
+Compared to UnsafeRow, CompactRow serialization is on average about half the
+size. Some examples:
+
+====================== ========= ==========
+Type                   UnsafeRow CompactRow
+====================== ========= ==========
+INTEGER                8         4
+BIGINT                 8         8
+REAL                   8         4
+DOUBLE                 8         8
+VARCHAR: “” (empty)    8         4
+VARCHAR: “Abc”         16        7
+====================== ========= ==========
diff --git a/velox/docs/develop/unsaferow.rst b/velox/docs/develop/serde/unsaferow.rst
similarity index 98%
rename from velox/docs/develop/unsaferow.rst
rename to velox/docs/develop/serde/unsaferow.rst
index c73b19555631..b64b44e5f0b2 100644
--- a/velox/docs/develop/unsaferow.rst
+++ b/velox/docs/develop/serde/unsaferow.rst
@@ -1,6 +1,6 @@
-==============================
-UnsafeRow Serialization Format
-==============================
+=========
+UnsafeRow
+=========
 
 Velox supports two data serialization formats out of the box:
 `PrestoPage `_
diff --git a/velox/docs/functions/presto/math.rst b/velox/docs/functions/presto/math.rst
index beea7aff0acc..6ddf0c2e3f95 100644
--- a/velox/docs/functions/presto/math.rst
+++ b/velox/docs/functions/presto/math.rst
@@ -260,6 +260,16 @@ Probability Functions: cdf
 
    Compute the Cauchy cdf with given parameters median and scale (gamma): P(N; median, scale).
    The scale parameter must be a positive double. The value parameter must be a double on the interval [0, 1].
 
+.. function:: chi_squared_cdf(df, value) -> double
+
+   Compute the Chi-square cdf with given df (degrees of freedom) parameter: P(N < value; df).
+   The df parameter must be a positive real number, and value must be a non-negative real value (both of type DOUBLE).
+
+.. function:: f_cdf(df1, df2, value) -> double
+
+   Compute the F cdf with given df1 (numerator degrees of freedom) and df2 (denominator degrees of freedom) parameters: P(N < value; df1, df2).
+   The numerator and denominator df parameters must be positive real numbers. The value must be a non-negative real number.
+
 .. function:: normal_cdf(mean, sd, value) -> double
 
    Compute the Normal cdf with given mean and standard deviation (sd): P(N < value; mean, sd).
diff --git a/velox/docs/functions/spark/string.rst b/velox/docs/functions/spark/string.rst
index f3ecc78a688f..c1f433a53ece 100644
--- a/velox/docs/functions/spark/string.rst
+++ b/velox/docs/functions/spark/string.rst
@@ -11,6 +11,8 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
 .. spark:function:: chr(n) -> varchar
 
    Returns the Unicode code point ``n`` as a single character string.
+   If ``n < 0``, the result is an empty string.
+   If ``n >= 256``, the result is equivalent to chr(``n % 256``).
 
 .. spark:function:: contains(left, right) -> boolean
diff --git a/velox/duckdb/CMakeLists.txt b/velox/duckdb/CMakeLists.txt
index dabc52b1fc97..eee20a7f868f 100644
--- a/velox/duckdb/CMakeLists.txt
+++ b/velox/duckdb/CMakeLists.txt
@@ -12,4 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 add_subdirectory(conversion)
-add_subdirectory(memory)
diff --git a/velox/duckdb/memory/Allocator.cpp b/velox/duckdb/memory/Allocator.cpp
deleted file mode 100644
index 3f473de5c012..000000000000
--- a/velox/duckdb/memory/Allocator.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "velox/duckdb/memory/Allocator.h" - -namespace facebook::velox::duckdb { - -::duckdb::data_ptr_t veloxPoolAllocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::idx_t size) { - auto veloxPrivateData = dynamic_cast(privateData); - VELOX_CHECK(veloxPrivateData); - return static_cast<::duckdb::data_ptr_t>( - veloxPrivateData->pool.allocate(size)); -} - -void veloxPoolFree( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t size) { - auto veloxPrivateData = dynamic_cast(privateData); - VELOX_CHECK(veloxPrivateData); - veloxPrivateData->pool.free(pointer, size); -} - -::duckdb::data_ptr_t veloxPoolReallocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t oldSize, - ::duckdb::idx_t size) { - auto veloxPrivateData = dynamic_cast(privateData); - VELOX_CHECK(veloxPrivateData); - return static_cast<::duckdb::data_ptr_t>( - veloxPrivateData->pool.reallocate(pointer, oldSize, size)); -} - -VeloxPoolAllocator& getDefaultAllocator() { - static std::shared_ptr pool = - memory::addDefaultLeafMemoryPool("VeloxPoolAllocator"); - static VeloxPoolAllocator allocator{*pool}; - return allocator; -} - -} // namespace facebook::velox::duckdb diff --git a/velox/duckdb/memory/Allocator.h b/velox/duckdb/memory/Allocator.h deleted file mode 100644 index 692a4ce582f1..000000000000 --- a/velox/duckdb/memory/Allocator.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "velox/common/memory/Memory.h" -#include "velox/external/duckdb/duckdb.hpp" - -namespace facebook::velox::duckdb { - -struct PrivateVeloxAllocatorData : public ::duckdb::PrivateAllocatorData { - explicit PrivateVeloxAllocatorData(memory::MemoryPool& pool_) : pool(pool_) {} - - ~PrivateVeloxAllocatorData() override {} - - memory::MemoryPool& pool; -}; - -::duckdb::data_ptr_t veloxPoolAllocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::idx_t size); - -void veloxPoolFree( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t size); - -::duckdb::data_ptr_t veloxPoolReallocate( - ::duckdb::PrivateAllocatorData* privateData, - ::duckdb::data_ptr_t pointer, - ::duckdb::idx_t oldSize, - ::duckdb::idx_t size); - -class VeloxPoolAllocator : public ::duckdb::Allocator { - public: - explicit VeloxPoolAllocator(memory::MemoryPool& pool) - : ::duckdb::Allocator( - veloxPoolAllocate, - veloxPoolFree, - veloxPoolReallocate, - std::make_unique(pool)) {} -}; - -VeloxPoolAllocator& getDefaultAllocator(); - -} // namespace facebook::velox::duckdb diff --git a/velox/duckdb/memory/CMakeLists.txt b/velox/duckdb/memory/CMakeLists.txt deleted file mode 100644 index e8f751418382..000000000000 --- a/velox/duckdb/memory/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -add_library(velox_duckdb_allocator Allocator.cpp) - -target_link_libraries(velox_duckdb_allocator velox_dwio_common duckdb fmt::fmt) - -if(NOT VELOX_DISABLE_GOOGLETEST) - target_link_libraries(velox_duckdb_allocator gtest) -endif() diff --git a/velox/dwio/common/CMakeLists.txt b/velox/dwio/common/CMakeLists.txt index 7f83771b3ea9..080d0f7826e2 100644 --- a/velox/dwio/common/CMakeLists.txt +++ b/velox/dwio/common/CMakeLists.txt @@ -20,8 +20,6 @@ elseif(${VELOX_BUILD_TEST_UTILS}) add_subdirectory(tests/utils) endif() -include_directories(/opt/homebrew/opt/protobuf/include) - add_library( velox_dwio_common BitConcatenation.cpp @@ -54,6 +52,8 @@ add_library( TypeWithId.cpp WriterFactory.cpp) +target_include_directories(velox_dwio_common PRIVATE ${Protobuf_INCLUDE_DIRS}) + target_link_libraries( velox_dwio_common velox_buffer diff --git a/velox/dwio/common/CachedBufferedInput.cpp b/velox/dwio/common/CachedBufferedInput.cpp index 9c20aa580491..f8fbf33f68ea 100644 --- a/velox/dwio/common/CachedBufferedInput.cpp +++ b/velox/dwio/common/CachedBufferedInput.cpp @@ -87,8 +87,9 @@ bool CachedBufferedInput::shouldPreload(int32_t numPages) { memory::AllocationTraits::kPageSize; } auto cachePages = cache_->incrementCachedPages(0); - auto maxPages = memory::AllocationTraits::numPages(cache_->capacity()); - auto allocatedPages = cache_->numAllocated(); + auto allocator = cache_->allocator(); + auto maxPages = memory::AllocationTraits::numPages(allocator->capacity()); + auto allocatedPages = allocator->numAllocated(); if (numPages < maxPages - allocatedPages) { // There is free space for the read-ahead. return true; diff --git a/velox/dwio/common/SelectiveColumnReader.cpp b/velox/dwio/common/SelectiveColumnReader.cpp index 78754bf37db7..4b6624245139 100644 --- a/velox/dwio/common/SelectiveColumnReader.cpp +++ b/velox/dwio/common/SelectiveColumnReader.cpp @@ -126,22 +126,27 @@ void SelectiveColumnReader::prepareNulls( simd::memset(rawResultNulls_, bits::kNotNullByte, resultNulls_->capacity()); } -bool SelectiveColumnReader::shouldMoveNulls(RowSet rows) { - if (rows.size() == numValues_) { +const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { + if (rows.size() == numValues_ || !anyNulls_) { // Nulls will only be moved if there is a selection on values. A cast // alone does not move nulls. 
-    return false;
+    return nullptr;
   }
-  VELOX_CHECK(
-      !returnReaderNulls_,
-      "Do not return reader nulls if retrieving a subset of values");
-  if (anyNulls_) {
-    VELOX_CHECK(
-        resultNulls_ && resultNulls_->as<uint64_t>() == rawResultNulls_);
-    VELOX_CHECK_GT(resultNulls_->capacity() * 8, rows.size());
-    return true;
+  const uint64_t* moveFrom = rawResultNulls_;
+  if (returnReaderNulls_) {
+    if (!(resultNulls_ && resultNulls_->unique() &&
+          resultNulls_->capacity() >= rows.size() + simd::kPadding)) {
+      resultNulls_ = AlignedBuffer::allocate<bool>(
+          rows.size() + (simd::kPadding * 8), &memoryPool_);
+      rawResultNulls_ = resultNulls_->asMutable<uint64_t>();
+    }
+    moveFrom = nullsInReadRange_->as<uint64_t>();
+    bits::copyBits(moveFrom, 0, rawResultNulls_, 0, rows.size());
+    returnReaderNulls_ = false;
   }
-  return false;
+  VELOX_CHECK(resultNulls_ && resultNulls_->as<uint64_t>() == rawResultNulls_);
+  VELOX_CHECK_GT(resultNulls_->capacity() * 8, rows.size());
+  return moveFrom;
 }
 
 void SelectiveColumnReader::getIntValues(
@@ -257,7 +262,7 @@ void SelectiveColumnReader::compactScalarValues(
   auto rawBits = reinterpret_cast<uint64_t*>(rawValues_);
   vector_size_t rowIndex = 0;
   auto nextRow = rows[rowIndex];
-  bool moveNulls = shouldMoveNulls(rows);
+  auto* moveNullsFrom = shouldMoveNulls(rows);
   for (size_t i = 0; i < numValues_; i++) {
     if (outputRows_[i] < nextRow) {
       continue;
     }
@@ -266,9 +271,8 @@ void SelectiveColumnReader::compactScalarValues(
     VELOX_DCHECK(outputRows_[i] == nextRow);
     bits::setBit(rawBits, rowIndex, bits::isBitSet(rawBits, i));
-    if (moveNulls && rowIndex != i) {
-      bits::setBit(
-          rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i));
+    if (moveNullsFrom && rowIndex != i) {
+      bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i));
     }
     if (!isFinal) {
       outputRows_[rowIndex] = nextRow;
@@ -289,7 +293,7 @@ char* SelectiveColumnReader::copyStringValue(folly::StringPiece value) {
   if (stringBuffers_.empty() || rawStringUsed_ + size > rawStringSize_) {
     auto bytes = std::max(size, kStringBufferSize);
     BufferPtr buffer = AlignedBuffer::allocate<char>(bytes, &memoryPool_);
-    // Use the prefered size instead of the requested one to improve memory
+    // Use the preferred size instead of the requested one to improve memory
     // efficiency.
     buffer->setSize(buffer->capacity());
     stringBuffers_.push_back(buffer);
diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h
index eca0f34fb931..90541b29483f 100644
--- a/velox/dwio/common/SelectiveColumnReader.h
+++ b/velox/dwio/common/SelectiveColumnReader.h
@@ -496,9 +496,10 @@ class SelectiveColumnReader {
   template <typename T, typename TVector>
   void upcastScalarValues(RowSet rows);
 
-  // Returns true if compactScalarValues and upcastScalarValues should
-  // move null flags. Checks consistency of nulls-related state.
-  bool shouldMoveNulls(RowSet rows);
+  // Returns the source null bits if compactScalarValues and upcastScalarValues
+  // should move null flags. Returns nullptr if nulls do not need to be moved.
+  // Checks consistency of nulls-related state.
+ const uint64_t* shouldMoveNulls(RowSet rows); void addStringValue(folly::StringPiece value); diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index 1a8a69909077..0b3c0f37d599 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -183,7 +183,7 @@ void SelectiveColumnReader::upcastScalarValues(RowSet rows) { } vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - bool moveNulls = shouldMoveNulls(rows); + auto* moveNullsFrom = shouldMoveNulls(rows); for (size_t i = 0; i < numValues_; i++) { if (sourceRows[i] < nextRow) { continue; @@ -191,9 +191,8 @@ void SelectiveColumnReader::upcastScalarValues(RowSet rows) { VELOX_DCHECK(sourceRows[i] == nextRow); buf[rowIndex] = typedSourceValues[i]; - if (moveNulls && rowIndex != i) { - bits::setBit( - rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i)); + if (moveNullsFrom && rowIndex != i) { + bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); } valueRows_[rowIndex] = nextRow; rowIndex++; @@ -239,7 +238,7 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { } vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - bool moveNulls = shouldMoveNulls(rows); + auto* moveNullsFrom = shouldMoveNulls(rows); for (size_t i = 0; i < numValues_; i++) { if (sourceRows[i] < nextRow) { continue; @@ -247,9 +246,8 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { VELOX_DCHECK(sourceRows[i] == nextRow); typedDestValues[rowIndex] = typedSourceValues[i]; - if (moveNulls && rowIndex != i) { - bits::setBit( - rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i)); + if (moveNullsFrom && rowIndex != i) { + bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); } if (!isFinal) { valueRows_[rowIndex] = nextRow; @@ -310,7 +308,7 @@ void SelectiveColumnReader::compactComplexValues( } vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - bool moveNulls = shouldMoveNulls(rows); + auto* moveNullsFrom = shouldMoveNulls(rows); for (size_t i = 0; i < numValues_; i++) { if (sourceRows[i] < nextRow) { continue; @@ -319,9 +317,8 @@ void SelectiveColumnReader::compactComplexValues( VELOX_DCHECK(sourceRows[i] == nextRow); // The value at i is moved to be the value at 'rowIndex'. 
move(i, rowIndex); - if (moveNulls && rowIndex != i) { - bits::setBit( - rawResultNulls_, rowIndex, bits::isBitSet(rawResultNulls_, i)); + if (moveNullsFrom && rowIndex != i) { + bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); } if (!isFinal) { valueRows_[rowIndex] = nextRow; diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 5ea0249c8898..bf8826f485b3 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -128,8 +128,8 @@ void SelectiveStructColumnReaderBase::read( activeRows = outputRows_; } - VELOX_CHECK(!children_.empty()); auto& childSpecs = scanSpec_->children(); + VELOX_CHECK(!childSpecs.empty()); for (size_t i = 0; i < childSpecs.size(); ++i) { auto& childSpec = childSpecs[i]; if (isChildConstant(*childSpec)) { @@ -293,7 +293,7 @@ void setNullField(vector_size_t size, VectorPtr& field) { void SelectiveStructColumnReaderBase::getValues( RowSet rows, VectorPtr* result) { - VELOX_CHECK(!children_.empty()); + VELOX_CHECK(!scanSpec_->children().empty()); VELOX_CHECK( *result != nullptr, "SelectiveStructColumnReaderBase expects a non-null result"); diff --git a/velox/dwio/dwrf/test/CacheInputTest.cpp b/velox/dwio/dwrf/test/CacheInputTest.cpp index a31d82146590..3bb9c588d2c2 100644 --- a/velox/dwio/dwrf/test/CacheInputTest.cpp +++ b/velox/dwio/dwrf/test/CacheInputTest.cpp @@ -114,6 +114,9 @@ class CacheTest : public testing::Test { if (ssdCache) { ssdCache->deleteFiles(); } + if (cache_) { + cache_->prepareShutdown(); + } } void initializeCache(uint64_t maxBytes, uint64_t ssdBytes = 0) { @@ -130,10 +133,8 @@ class CacheTest : public testing::Test { } memory::MmapAllocator::Options options; options.capacity = maxBytes; - cache_ = std::make_shared( - std::make_shared(options), - maxBytes, - std::move(ssd)); + allocator_ = std::make_shared(options); + cache_ = AsyncDataCache::create(allocator_.get(), std::move(ssd)); cache_->setVerifyHook(checkEntry); for (auto i = 0; i < kMaxStreams; ++i) { streamIds_.push_back(std::make_unique( @@ -424,6 +425,7 @@ class CacheTest : public testing::Test { folly::F14FastMap> pathToInput_; std::shared_ptr tempDirectory_; cache::FileGroupStats* FOLLY_NULLABLE groupStats_ = nullptr; + std::shared_ptr allocator_; std::shared_ptr cache_; std::shared_ptr ioStats_; std::unique_ptr executor_; @@ -471,7 +473,7 @@ TEST_F(CacheTest, window) { auto cacheInput = dynamic_cast(stream.get()); EXPECT_TRUE(cacheInput != nullptr); auto maxSize = - cache_->sizeClasses().back() * memory::AllocationTraits::kPageSize; + allocator_->sizeClasses().back() * memory::AllocationTraits::kPageSize; const void* buffer; int32_t size; int32_t numRead = 0; diff --git a/velox/dwio/dwrf/test/ReaderTest.cpp b/velox/dwio/dwrf/test/ReaderTest.cpp index 1a349c4abb68..fb58f9ba3bcb 100644 --- a/velox/dwio/dwrf/test/ReaderTest.cpp +++ b/velox/dwio/dwrf/test/ReaderTest.cpp @@ -1893,3 +1893,39 @@ TEST(TestReader, reuseRowNumberColumn) { ASSERT_NE(rowNum.get(), result->asUnchecked()->childAt(1).get()); } } + +TEST(TestReader, failToReuseReaderNulls) { + auto* pool = defaultPool.get(); + VectorMaker maker(pool); + auto c0 = maker.rowVector( + {"a", "b"}, + { + maker.flatVector(11, folly::identity), + maker.flatVector( + 11, folly::identity, [](auto i) { return i % 3 == 0; }), + }); + // Set a null so that the children will not be loaded lazily. 
+ bits::setNull(c0->mutableRawNulls(), 10); + auto data = maker.rowVector({ + c0, + maker.rowVector({"c"}, {maker.flatVector(11, folly::identity)}), + }); + auto schema = asRowType(data->type()); + auto [writer, reader] = createWriterReader({data}, *pool); + auto spec = std::make_shared(""); + spec->addAllChildFields(*schema); + spec->childByName("c0")->childByName("a")->setFilter( + std::make_unique( + 0, std::numeric_limits::max(), false)); + spec->childByName("c1")->childByName("c")->setFilter( + std::make_unique(0, 4, false)); + RowReaderOptions rowReaderOpts; + rowReaderOpts.setScanSpec(spec); + auto rowReader = reader->createRowReader(rowReaderOpts); + auto result = BaseVector::create(schema, 0, pool); + ASSERT_EQ(rowReader->next(10, result), 10); + ASSERT_EQ(result->size(), 5); + for (int i = 0; i < result->size(); ++i) { + ASSERT_TRUE(result->equalValueAt(data.get(), i, i)) << result->toString(i); + } +} diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 8672d1ad690c..f113feb04242 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -177,6 +177,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat + 1, maxDefine); } @@ -196,6 +197,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } @@ -220,6 +222,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } else if (children.size() == 2) { @@ -234,6 +237,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } @@ -248,6 +252,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( ParquetTypeWithId::kNonLeaf, // columnIdx, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine); } @@ -260,6 +265,10 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( int32_t type_length = schemaElement.__isset.type_length ? schemaElement.type_length : 0; std::vector> children; + const std::optional logicalType_ = + schemaElement.__isset.logicalType + ? 
std::optional(schemaElement.logicalType) + : std::nullopt; std::shared_ptr leafTypePtr = std::make_shared( veloxType, @@ -269,6 +278,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( columnIdx++, name, schemaElement.type, + logicalType_, maxRepeat, maxDefine, precision, @@ -289,6 +299,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( columnIdx++, std::move(name), std::nullopt, + std::nullopt, maxRepeat, maxDefine - 1); } diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.h b/velox/dwio/parquet/reader/ParquetTypeWithId.h index 9a382194ef06..1fa0d6b3f5d8 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.h +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.h @@ -42,6 +42,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { uint32_t column, std::string name, std::optional parquetType, + std::optional logicalType, uint32_t maxRepeat, uint32_t maxDefine, int32_t precision = 0, @@ -50,6 +51,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { : TypeWithId(type, std::move(children), id, maxId, column), name_(name), parquetType_(parquetType), + logicalType_(std::move(logicalType)), maxRepeat_(maxRepeat), maxDefine_(maxDefine), precision_(precision), @@ -74,6 +76,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { const std::string name_; const std::optional parquetType_; + const std::optional logicalType_; const uint32_t maxRepeat_; const uint32_t maxDefine_; const int32_t precision_; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 5a30c02bb5ac..3c219dccfa8d 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -400,6 +400,47 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayLegacy) { "SELECT UNNEST(array[array['a', 'b'], array[], array['c', 'd']])"); } +TEST_F(ParquetTableScanTest, readAsLowerCase) { + auto plan = PlanBuilder(pool_.get()) + .tableScan(ROW({"a"}, {BIGINT()}), {}, "") + .planNode(); + CursorParameters params; + std::shared_ptr executor = + std::make_shared( + std::thread::hardware_concurrency()); + std::shared_ptr queryCtx = + std::make_shared(executor.get()); + std::unordered_map configs = { + {std::string( + connector::hive::HiveConfig::kFileColumnNamesReadAsLowerCase), + "true"}}; + queryCtx->setConnectorConfigOverridesUnsafe( + kHiveConnectorId, std::move(configs)); + params.queryCtx = queryCtx; + params.planNode = plan; + const int numSplitsPerFile = 1; + + bool noMoreSplits = false; + auto addSplits = [&](exec::Task* task) { + if (!noMoreSplits) { + auto const splits = HiveConnectorTestBase::makeHiveConnectorSplits( + {getExampleFilePath("upper.parquet")}, + numSplitsPerFile, + dwio::common::FileFormat::PARQUET); + for (const auto& split : splits) { + task->addSplit("0", exec::Split(split)); + } + task->noMoreSplits("0"); + } + noMoreSplits = true; + }; + auto result = readCursor(params, addSplits); + ASSERT_TRUE(waitForTaskCompletion(result.first->task().get())); + auto vector = makeFlatVector({0, 1}); + auto expected = makeRowVector({"a"}, {vector}); + assertEqualResults(result.second, {expected}); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); folly::init(&argc, &argv, false); diff --git a/velox/examples/CMakeLists.txt b/velox/examples/CMakeLists.txt index 013fa6b5db54..8625d34ce82e 100644 --- a/velox/examples/CMakeLists.txt +++ b/velox/examples/CMakeLists.txt @@ -20,12 +20,12 @@ 
target_link_libraries(velox_example_simple_functions velox_functions_lib add_executable(velox_example_expression_eval ExpressionEval.cpp) target_link_libraries(velox_example_expression_eval velox_type velox_vector - velox_memory velox_expression) + velox_caching velox_memory velox_expression) add_executable(velox_example_opaque_type OpaqueType.cpp) target_link_libraries(velox_example_opaque_type velox_type velox_vector - velox_expression velox_memory) + velox_caching velox_expression velox_memory) # This is disabled temporarily until we figure out why g++ is crashing linking # it on linux builds. diff --git a/velox/exec/AggregateWindow.cpp b/velox/exec/AggregateWindow.cpp index 054426a317af..54a3d91bd1c0 100644 --- a/velox/exec/AggregateWindow.cpp +++ b/velox/exec/AggregateWindow.cpp @@ -35,10 +35,13 @@ class AggregateWindowFunction : public exec::WindowFunction { const std::string& name, const std::vector& args, const TypePtr& resultType, + bool ignoreNulls, velox::memory::MemoryPool* pool, HashStringAllocator* stringAllocator, const core::QueryConfig& config) : WindowFunction(resultType, pool, stringAllocator) { + VELOX_USER_CHECK( + !ignoreNulls, "Aggregate window functions do not support IGNORE NULLS"); argTypes_.reserve(args.size()); argIndices_.reserve(args.size()); argVectors_.reserve(args.size()); @@ -391,13 +394,19 @@ void registerAggregateWindowFunction(const std::string& name) { [name]( const std::vector& args, const TypePtr& resultType, - bool /*ignoreNulls*/, + bool ignoreNulls, velox::memory::MemoryPool* pool, HashStringAllocator* stringAllocator, const core::QueryConfig& config) -> std::unique_ptr { return std::make_unique( - name, args, resultType, pool, stringAllocator, config); + name, + args, + resultType, + ignoreNulls, + pool, + stringAllocator, + config); }); } } diff --git a/velox/exec/GroupingSet.cpp b/velox/exec/GroupingSet.cpp index abd7ac6c2cc5..e3f616289b87 100644 --- a/velox/exec/GroupingSet.cpp +++ b/velox/exec/GroupingSet.cpp @@ -307,10 +307,15 @@ namespace { void initializeAggregates( const std::vector& aggregates, - RowContainer& rows) { + RowContainer& rows, + bool excludeToIntermediate) { const auto numKeys = rows.keyTypes().size(); - for (auto i = 0; i < aggregates.size(); ++i) { - auto& function = aggregates[i].function; + int i = 0; + for (auto& aggregate : aggregates) { + auto& function = aggregate.function; + if (excludeToIntermediate && function->supportsToIntermediate()) { + continue; + } function->setAllocator(&rows.stringAllocator()); const auto rowColumn = rows.columnAt(numKeys + i); @@ -319,15 +324,19 @@ void initializeAggregates( rowColumn.nullByte(), rowColumn.nullMask(), rows.rowSizeOffset()); + ++i; } } } // namespace -std::vector GroupingSet::accumulators() { +std::vector GroupingSet::accumulators(bool excludeToIntermediate) { std::vector accumulators; accumulators.reserve(aggregates_.size()); for (auto& aggregate : aggregates_) { - accumulators.push_back(Accumulator{aggregate.function.get()}); + if (!excludeToIntermediate || + !aggregate.function->supportsToIntermediate()) { + accumulators.push_back(Accumulator{aggregate.function.get()}); + } } if (sortedAggregations_ != nullptr) { @@ -345,14 +354,14 @@ std::vector GroupingSet::accumulators() { void GroupingSet::createHashTable() { if (ignoreNullKeys_) { table_ = HashTable::createForAggregation( - std::move(hashers_), accumulators(), &pool_); + std::move(hashers_), accumulators(false), &pool_); } else { table_ = HashTable::createForAggregation( - std::move(hashers_), accumulators(), 
&pool_); + std::move(hashers_), accumulators(false), &pool_); } RowContainer& rows = *table_->rows(); - initializeAggregates(aggregates_, rows); + initializeAggregates(aggregates_, rows, false); auto numColumns = rows.keyTypes().size() + aggregates_.size(); @@ -637,9 +646,6 @@ bool GroupingSet::getOutput( if (table_) { table_->clear(); } - if (remainingInput_) { - addRemainingInput(); - } return false; } extractGroups(folly::Range(groups, numGroups), result); @@ -866,15 +872,16 @@ bool GroupingSet::getOutputWithSpill( mergeRows_ = std::make_unique( keyTypes, !ignoreNullKeys_, - accumulators(), + accumulators(false), std::vector(), false, false, false, false, - &pool_); + &pool_, + table_->rows()->stringAllocatorShared()); - initializeAggregates(aggregates_, *mergeRows_); + initializeAggregates(aggregates_, *mergeRows_, false); // Take ownership of the rows and free the hash table. The table will not be // needed for producing spill output. @@ -978,11 +985,20 @@ void GroupingSet::abandonPartialAggregation() { } } - VELOX_CHECK_EQ(table_->rows()->numRows(), 0) - intermediateRows_ = table_->moveRows(); - intermediateRows_->clear(); - - table_ = nullptr; + VELOX_CHECK_EQ(table_->rows()->numRows(), 0); + intermediateRows_ = std::make_unique( + table_->rows()->keyTypes(), + !ignoreNullKeys_, + accumulators(true), + std::vector(), + false, + false, + false, + false, + &pool_, + table_->rows()->stringAllocatorShared()); + initializeAggregates(aggregates_, *intermediateRows_, true); + table_.reset(); } void GroupingSet::toIntermediate( diff --git a/velox/exec/GroupingSet.h b/velox/exec/GroupingSet.h index dc0e11bad517..05b42a4c60b8 100644 --- a/velox/exec/GroupingSet.h +++ b/velox/exec/GroupingSet.h @@ -191,9 +191,11 @@ class GroupingSet { // groups. void extractSpillResult(const RowVectorPtr& result); - // Return a list of accumulators for 'aggregates_' plus one more accumulator - // for 'sortedAggregations_'. - std::vector accumulators(); + // Return a list of accumulators for 'aggregates_', plus one more accumulator + // for 'sortedAggregations_', and one for each 'distinctAggregations_'. When + // 'excludeToIntermediate' is true, skip the functions that support + // 'toIntermediate'. + std::vector accumulators(bool excludeToIntermediate); std::vector keyChannels_; diff --git a/velox/exec/HashBuild.cpp b/velox/exec/HashBuild.cpp index 5ce6e86d3fad..37e9599f6f85 100644 --- a/velox/exec/HashBuild.cpp +++ b/velox/exec/HashBuild.cpp @@ -777,23 +777,10 @@ bool HashBuild::finishHashBuild() { // https://github.com/facebookincubator/velox/issues/3567 is fixed. const bool allowParallelJoinBuild = !otherTables.empty() && spillPartitions.empty(); - // Inject test value to catch the memory allocations from parallel join - // build. - if (TestValue::enabled()) { - std::vector buildOps; - buildOps.reserve(peers.size()); - for (auto& peer : peers) { - auto* op = peer->findOperator(planNodeId()); - buildOps.push_back(op); - } - TestValue::adjust( - "facebook::velox::exec::HashBuild::prepareJoinTable", &buildOps); - } table_->prepareJoinTable( std::move(otherTables), allowParallelJoinBuild ? 
operatorCtx_->task()->queryCtx()->executor() : nullptr); - addRuntimeStats(); if (joinBridge_->setHashTable( std::move(table_), diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp index 64af72f58ebe..257781b0316a 100644 --- a/velox/exec/HashTable.cpp +++ b/velox/exec/HashTable.cpp @@ -764,7 +764,7 @@ bool HashTable::canApplyParallelJoinBuild() const { template void HashTable::parallelJoinBuild() { TestValue::adjust( - "facebook::velox::exec::HashTable::parallelJoinBuild", nullptr); + "facebook::velox::exec::HashTable::parallelJoinBuild", this); int32_t numPartitions = 1 + otherTables_.size(); VELOX_CHECK_GT( capacity_ / numPartitions, @@ -799,11 +799,15 @@ void HashTable::parallelJoinBuild() { syncWorkItems(buildSteps, error, true); }); + // The parallel table partitioning step. + std::vector> rowPartitions; + rowPartitions.reserve(numPartitions); for (auto i = 0; i < numPartitions; ++i) { - auto table = i == 0 ? this : otherTables_[i - 1].get(); - partitionSteps.push_back( - std::make_shared>([this, table, numPartitions]() { - partitionRows(*table); + auto* table = i == 0 ? this : otherTables_[i - 1].get(); + rowPartitions.push_back(table->rows()->createRowPartitions(*rows_->pool())); + partitionSteps.push_back(std::make_shared>( + [this, table, rawRowPartitions = rowPartitions.back().get()]() { + partitionRows(*table, *rawRowPartitions); return std::make_unique(true); })); assert(!partitionSteps.empty()); // lint @@ -814,20 +818,23 @@ void HashTable::parallelJoinBuild() { if (error) { std::rethrow_exception(error); } + + // The parallel table building step. std::vector> overflowPerPartition(numPartitions); for (auto i = 0; i < numPartitions; ++i) { - buildSteps.push_back( - std::make_shared>([i, &overflowPerPartition, this]() { - buildJoinPartition(i, overflowPerPartition[i]); + buildSteps.push_back(std::make_shared>( + [this, i, &overflowPerPartition, &rowPartitions]() { + buildJoinPartition(i, rowPartitions, overflowPerPartition[i]); return std::make_unique(true); })); - assert(!buildSteps.empty()); // lint + VELOX_CHECK(!buildSteps.empty()); buildExecutor_->add([step = buildSteps.back()]() { step->prepare(); }); } syncWorkItems(buildSteps, error); if (error) { std::rethrow_exception(error); } + raw_vector hashes; for (auto i = 0; i < numPartitions; ++i) { auto& overflows = overflowPerPartition[i]; @@ -872,7 +879,8 @@ int32_t findPartition( template void HashTable::partitionRows( - HashTable& subtable) { + HashTable& subtable, + RowPartitions& rowPartitions) { constexpr int32_t kBatch = 1024; raw_vector rows(kBatch); raw_vector hashes(kBatch); @@ -891,7 +899,7 @@ void HashTable::partitionRows( partitions[i] = findPartition( index, buildPartitionBounds_.data(), buildPartitionBounds_.size()); } - subtable.rows_->partitions().appendPartitions( + rowPartitions.appendPartitions( folly::Range(partitions.data(), numRows)); } } @@ -899,6 +907,7 @@ void HashTable::partitionRows( template void HashTable::buildJoinPartition( uint8_t partition, + const std::vector>& rowPartitions, std::vector& overflow) { constexpr int32_t kBatch = 1024; raw_vector rows(kBatch); @@ -908,7 +917,7 @@ void HashTable::buildJoinPartition( auto table = i == 0 ? 
this : otherTables_[i - 1].get(); RowContainerIterator iter; while (auto numRows = table->rows_->listPartitionRows( - iter, partition, kBatch, rows.data())) { + iter, partition, kBatch, *rowPartitions[i], rows.data())) { hashRows(folly::Range(rows.data(), numRows), false, hashes); insertForJoin( rows.data(), diff --git a/velox/exec/HashTable.h b/velox/exec/HashTable.h index 2f907c095e33..6c0788ca83f8 100644 --- a/velox/exec/HashTable.h +++ b/velox/exec/HashTable.h @@ -596,12 +596,17 @@ class HashTable : public BaseHashTable { // Inserts the rows in 'partition' from this and 'otherTables' into 'this'. // The rows that would have gone past the end of the partition are returned in // 'overflow'. - void buildJoinPartition(uint8_t partition, std::vector& overflow); + void buildJoinPartition( + uint8_t partition, + const std::vector>& rowPartitions, + std::vector& overflow); // Assigns a partition to each row of 'subtable' in RowPartitions of // subtable's RowContainer. If 'hashMode_' is kNormalizedKeys, records the // normalized key of each row below the row in its container. - void partitionRows(HashTable& subtable); + void partitionRows( + HashTable& subtable, + RowPartitions& rowPartitions); // Calculates hashes for 'rows' and returns them in 'hashes'. If // 'initNormalizedKeys' is true, the normalized keys are stored diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index d0f44c536693..0f3114fbe916 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -54,7 +54,7 @@ OperatorCtx::createConnectorQueryCtx( driverCtx_->task->queryCtx()->getConnectorConfig(connectorId), std::make_unique( execCtx()->queryCtx(), execCtx()->pool()), - driverCtx_->task->queryCtx()->allocator(), + driverCtx_->task->queryCtx()->cache(), driverCtx_->task->queryCtx()->queryId(), taskId(), planNodeId, diff --git a/velox/exec/PartitionedOutput.cpp b/velox/exec/PartitionedOutput.cpp index 50d301c75b43..20d000ddb6eb 100644 --- a/velox/exec/PartitionedOutput.cpp +++ b/velox/exec/PartitionedOutput.cpp @@ -103,7 +103,7 @@ BlockingReason Destination::flush( PartitionedOutput::PartitionedOutput( int32_t operatorId, - DriverCtx* FOLLY_NONNULL ctx, + DriverCtx* ctx, const std::shared_ptr& planNode) : Operator( ctx, @@ -131,9 +131,12 @@ PartitionedOutput::PartitionedOutput( maxBufferedBytes_(ctx->task->queryCtx() ->queryConfig() .maxPartitionedOutputBufferSize()) { - if (numDestinations_ == 1 || planNode->isBroadcast()) { - VELOX_CHECK(keyChannels_.empty()); - VELOX_CHECK_NULL(partitionFunction_); + if (!planNode->isPartitioned()) { + VELOX_USER_CHECK_EQ(numDestinations_, 1); + } + if (numDestinations_ == 1) { + VELOX_USER_CHECK(keyChannels_.empty()); + VELOX_USER_CHECK_NULL(partitionFunction_); } } diff --git a/velox/exec/PartitionedOutput.h b/velox/exec/PartitionedOutput.h index 16ae62cbf85c..0da3801dceb2 100644 --- a/velox/exec/PartitionedOutput.h +++ b/velox/exec/PartitionedOutput.h @@ -27,7 +27,7 @@ class Destination { Destination( const std::string& taskId, int destination, - memory::MemoryPool* FOLLY_NONNULL pool) + memory::MemoryPool* pool) : taskId_(taskId), destination_(destination), pool_(pool) { setTargetSizePct(); } @@ -52,13 +52,13 @@ class Destination { const RowVectorPtr& output, PartitionedOutputBufferManager& bufferManager, const std::function& bufferReleaseFn, - bool* FOLLY_NONNULL atEnd, - ContinueFuture* FOLLY_NONNULL future); + bool* atEnd, + ContinueFuture* future); BlockingReason flush( PartitionedOutputBufferManager& bufferManager, const std::function& bufferReleaseFn, - 
ContinueFuture* FOLLY_NULLABLE future); + ContinueFuture* future); bool isFinished() const { return finished_; @@ -91,7 +91,7 @@ class Destination { const std::string taskId_; const int destination_; - memory::MemoryPool* FOLLY_NONNULL const pool_; + memory::MemoryPool* const pool_; uint64_t bytesInCurrent_{0}; std::vector rows_; @@ -130,7 +130,7 @@ class PartitionedOutput : public Operator { PartitionedOutput( int32_t operatorId, - DriverCtx* FOLLY_NONNULL ctx, + DriverCtx* ctx, const std::shared_ptr& planNode); void addInput(RowVectorPtr input) override; @@ -146,7 +146,7 @@ class PartitionedOutput : public Operator { return true; } - BlockingReason isBlocked(ContinueFuture* FOLLY_NONNULL future) override { + BlockingReason isBlocked(ContinueFuture* future) override { if (blockingReason_ != BlockingReason::kNotBlocked) { *future = std::move(future_); blockingReason_ = BlockingReason::kNotBlocked; diff --git a/velox/exec/PartitionedOutputBufferManager.cpp b/velox/exec/PartitionedOutputBufferManager.cpp index 1a7a36dd6e9d..fc124463dcb7 100644 --- a/velox/exec/PartitionedOutputBufferManager.cpp +++ b/velox/exec/PartitionedOutputBufferManager.cpp @@ -242,11 +242,12 @@ PartitionedOutputBuffer::PartitionedOutputBuffer( void PartitionedOutputBuffer::updateOutputBuffers( int numBuffers, bool noMoreBuffers) { - VELOX_CHECK( - !isPartitioned(), - "{} is not supported on {} output buffer", - __FUNCTION__, - kind_); + if (isPartitioned()) { + VELOX_CHECK_EQ(buffers_.size(), numBuffers); + VELOX_CHECK(noMoreBuffers); + noMoreBuffers_ = true; + return; + } std::vector promises; bool isFinished; @@ -476,9 +477,17 @@ bool PartitionedOutputBuffer::isFinished() { } bool PartitionedOutputBuffer::isFinishedLocked() { - if (!isPartitioned() && !noMoreBuffers_) { + // NOTE: for broadcast output buffer, we can only mark it as finished after + // receiving the no more (destination) buffers signal. + if (isBroadcast() && !noMoreBuffers_) { return false; } + if (isArbitrary()) { + VELOX_CHECK_NOT_NULL(arbitraryBuffer_); + if (!arbitraryBuffer_->empty()) { + return false; + } + } for (auto& buffer : buffers_) { if (buffer != nullptr) { return false; diff --git a/velox/exec/RowContainer.cpp b/velox/exec/RowContainer.cpp index c9053d656372..dd2a0ee066bb 100644 --- a/velox/exec/RowContainer.cpp +++ b/velox/exec/RowContainer.cpp @@ -107,14 +107,17 @@ RowContainer::RowContainer( bool isJoinBuild, bool hasProbedFlag, bool hasNormalizedKeys, - memory::MemoryPool* pool) + memory::MemoryPool* pool, + std::shared_ptr stringAllocator) : keyTypes_(keyTypes), nullableKeys_(nullableKeys), - accumulators_(accumulators), isJoinBuild_(isJoinBuild), + accumulators_(accumulators), hasNormalizedKeys_(hasNormalizedKeys), rows_(pool), - stringAllocator_(pool) { + stringAllocator_( + stringAllocator ? stringAllocator + : std::make_shared(pool)) { // Compute the layout of the payload row. The row has keys, null // flags, accumulators, dependent fields. All fields are fixed // width. 
If variable width data is referenced, this is done with @@ -234,11 +237,14 @@ RowContainer::RowContainer( } } +RowContainer::~RowContainer() { + clear(); +} + char* RowContainer::newRow() { - char* row; - VELOX_DCHECK( - !partitions_, "Rows may not be added after partitions() has been called"); + VELOX_DCHECK(mutable_, "Can't add row into an immutable row container"); ++numRows_; + char* row; if (firstFreeRow_) { row = firstFreeRow_; VELOX_CHECK(bits::isBitSet(row, freeFlagOffset_)); @@ -259,8 +265,11 @@ char* RowContainer::initializeRow(char* row, bool reuse) { auto rows = folly::Range(&row, 1); freeVariableWidthFields(rows); freeAggregates(rows); + } else if (rowSizeOffset_ != 0 && checkFree_) { + // zero out string views so that clear() will not hit uninited data. The + // fastest way is to set the whole row to 0. + ::memset(row, 0, fixedRowSize_); } - if (!nullOffsets_.empty()) { memcpy( row + nullByte(nullOffsets_[0]), @@ -300,7 +309,11 @@ void RowContainer::freeVariableWidthFields(folly::Range rows) { if (!isNullAt(row, column.nullByte(), column.nullMask())) { StringView view = valueAt(row, column.offset()); if (!view.isInline()) { - stringAllocator_.free(HashStringAllocator::headerOf(view.data())); + stringAllocator_->free( + HashStringAllocator::headerOf(view.data())); + if (checkFree_) { + valueAt(row, column.offset()) = StringView(); + } } } } @@ -421,11 +434,11 @@ void RowContainer::storeComplexType( row[nullByte] |= nullMask; return; } - RowSizeTracker tracker(row[rowSizeOffset_], stringAllocator_); - ByteStream stream(&stringAllocator_, false, false); - auto position = stringAllocator_.newWrite(stream); + RowSizeTracker tracker(row[rowSizeOffset_], *stringAllocator_); + ByteStream stream(stringAllocator_.get(), false, false); + auto position = stringAllocator_->newWrite(stream); ContainerRowSerde::serialize(*decoded.base(), decoded.index(index), stream); - stringAllocator_.finishWrite(stream, 0); + stringAllocator_->finishWrite(stream, 0); valueAt(row, offset) = StringView(reinterpret_cast(position.position), stream.size()); } @@ -543,22 +556,22 @@ void RowContainer::hash( } void RowContainer::clear() { - if (usesExternalMemory_) { + const bool sharedStringAllocator = !stringAllocator_.unique(); + if (checkFree_ || sharedStringAllocator || usesExternalMemory_) { constexpr int32_t kBatch = 1000; std::vector rows(kBatch); - RowContainerIterator iter; - for (;;) { - int64_t numRows = listRows(&iter, kBatch, rows.data()); - if (!numRows) { - break; - } - auto rowsData = folly::Range(rows.data(), numRows); - freeAggregates(rowsData); + while (auto numRows = listRows(&iter, kBatch, rows.data())) { + eraseRows(folly::Range(rows.data(), numRows)); } } rows_.clear(); - stringAllocator_.clear(); + if (!sharedStringAllocator) { + if (checkFree_) { + stringAllocator_->checkEmpty(); + } + stringAllocator_->clear(); + } numRows_ = 0; numRowsWithNormalizedKey_ = 0; normalizedKeySize_ = originalNormalizedKeySize_; @@ -618,7 +631,7 @@ std::optional RowContainer::estimateRowSize() const { } int64_t freeBytes = rows_.freeBytes() + fixedRowSize_ * numFreeRows_; int64_t usedSize = rows_.allocatedBytes() - freeBytes + - stringAllocator_.retainedSize() - stringAllocator_.freeSpace(); + stringAllocator_->retainedSize() - stringAllocator_->freeSpace(); int64_t rowSize = usedSize / numRows_; VELOX_CHECK_GT( rowSize, 0, "Estimated row size of the RowContainer must be positive."); @@ -633,7 +646,7 @@ int64_t RowContainer::sizeIncrement( constexpr int32_t kAllocUnit = 
memory::AllocationTraits::kHugePageSize;
  int32_t needRows = std::max<int32_t>(0, numRows - numFreeRows_);
  int64_t needBytes =
-      std::max<int64_t>(0, variableLengthBytes - stringAllocator_.freeSpace());
+      std::max<int64_t>(0, variableLengthBytes - stringAllocator_->freeSpace());
  return bits::roundUp(needRows * fixedRowSize_, kAllocUnit) +
      bits::roundUp(needBytes, kAllocUnit);
 }
@@ -683,28 +696,30 @@ void RowContainer::skip(RowContainerIterator& iter, int32_t numRows) {
   iter.rowNumber += numRows;
 }
 
-RowPartitions& RowContainer::partitions() {
-  if (!partitions_) {
-    partitions_ = std::make_unique<RowPartitions>(numRows_, *rows_.pool());
-  }
-  return *partitions_;
+std::unique_ptr<RowPartitions> RowContainer::createRowPartitions(
+    memory::MemoryPool& pool) {
+  VELOX_CHECK(
+      mutable_, "Can only create RowPartitions once from a row container");
+  mutable_ = false;
+  return std::make_unique<RowPartitions>(numRows_, pool);
 }
 
 int32_t RowContainer::listPartitionRows(
     RowContainerIterator& iter,
     uint8_t partition,
     int32_t maxRows,
+    const RowPartitions& rowPartitions,
     char** result) {
-  if (!numRows_) {
-    return 0;
-  }
   VELOX_CHECK(
-      partitions_, "partitions() must be called before listPartitionRows()");
+      !mutable_, "Can't list partition rows from a mutable row container");
   VELOX_CHECK_EQ(
-      partitions_->size(), numRows_, "All rows must have a partition");
-  auto partitionNumberVector = xsimd::batch<uint8_t>::broadcast(partition);
-  auto& allocation = partitions_->allocation();
-  auto numRuns = allocation.numRuns();
+      rowPartitions.size(), numRows_, "All rows must have a partition");
+  if (numRows_ == 0) {
+    return 0;
+  }
+  const auto partitionNumberVector =
+      xsimd::batch<uint8_t>::broadcast(partition);
+  const auto& allocation = rowPartitions.allocation();
   int32_t numResults = 0;
   while (numResults < maxRows && iter.rowNumber < numRows_) {
     constexpr int32_t kBatch = xsimd::batch<uint8_t>::size;
@@ -762,10 +777,10 @@ int32_t RowContainer::listPartitionRows(
 
 RowPartitions::RowPartitions(int32_t numRows, memory::MemoryPool& pool)
     : capacity_(numRows) {
-  auto numPages =
-      bits::roundUp(capacity_, memory::AllocationTraits::kPageSize) /
-      memory::AllocationTraits::kPageSize;
-  pool.allocateNonContiguous(numPages, allocation_);
+  const auto numPages = memory::AllocationTraits::numPages(capacity_);
+  if (numPages > 0) {
+    pool.allocateNonContiguous(numPages, allocation_);
+  }
 }
 
 void RowPartitions::appendPartitions(folly::Range<const uint8_t*> partitions) {
diff --git a/velox/exec/RowContainer.h b/velox/exec/RowContainer.h
index 1af2166dc49a..8b99da63ac8a 100644
--- a/velox/exec/RowContainer.h
+++ b/velox/exec/RowContainer.h
@@ -186,6 +186,8 @@ class RowContainer {
             false, // hasNormalizedKey
             pool) {}
 
+  ~RowContainer();
+
   static int32_t combineAlignments(int32_t a, int32_t b);
 
   // 'keyTypes' gives the type of the key of each row. For a group by,
@@ -204,6 +206,10 @@
   // into one word for faster comparison. The bulk allocation is done
   // from 'allocator'. ContainerRowSerde is used for serializing complex
   // type values into the container.
+  /// 'stringAllocator' allows sharing the variable length data arena with
+  /// another RowContainer. This is needed for spilling, where the same
+  /// aggregates are used for reading one container and merging into another.
  RowContainer(
      const std::vector<TypePtr>& keyTypes,
      bool nullableKeys,
@@ -213,7 +219,8 @@
      bool isJoinBuild,
      bool hasProbedFlag,
      bool hasNormalizedKey,
-      memory::MemoryPool* FOLLY_NONNULL pool);
+      memory::MemoryPool* FOLLY_NONNULL pool,
+      std::shared_ptr<HashStringAllocator> stringAllocator = nullptr);
 
   // Allocates a new row and initializes possible aggregates to null.
   char* FOLLY_NONNULL newRow();
@@ -264,6 +271,10 @@
       int32_t columnIndex);
 
   HashStringAllocator& stringAllocator() {
+    return *stringAllocator_;
+  }
+
+  const std::shared_ptr<HashStringAllocator>& stringAllocatorShared() {
     return stringAllocator_;
   }
 
@@ -414,7 +425,9 @@
       auto range = rows_.rangeAt(i);
       auto* data =
           range.data() + memory::alignmentPadding(range.data(), alignment_);
-      auto limit = range.size();
+      auto limit = range.size() -
+          (reinterpret_cast<uintptr_t>(data) -
+           reinterpret_cast<uintptr_t>(range.data()));
       auto row = iter->rowOffset;
       while (row + rowSize <= limit) {
         rows[count++] = data + row +
@@ -575,7 +588,7 @@
       uint64_t* FOLLY_NONNULL result);
 
   uint64_t allocatedBytes() const {
-    return rows_.allocatedBytes() + stringAllocator_.retainedSize();
+    return rows_.allocatedBytes() + stringAllocator_->retainedSize();
   }
 
   // Returns the number of fixed size rows that can be allocated
@@ -584,7 +597,7 @@
   std::pair<int64_t, int64_t> freeSpace() const {
     return std::make_pair(
         rows_.freeBytes() / fixedRowSize_ + numFreeRows_,
-        stringAllocator_.freeSpace());
+        stringAllocator_->freeSpace());
   }
 
   // Returns the average size of rows in bytes stored in this container.
@@ -614,7 +627,7 @@
   }
 
   memory::MemoryPool* FOLLY_NONNULL pool() const {
-    return stringAllocator_.pool();
+    return stringAllocator_->pool();
   }
 
   // Returns the types of all non-aggregate columns of 'this', keys first.
@@ -631,7 +644,7 @@
   }
 
   const HashStringAllocator& stringAllocator() const {
-    return stringAllocator_;
+    return *stringAllocator_;
   }
 
   // Checks that row and free row counts match and that free list
@@ -643,26 +656,32 @@
     return (row[nullByte] & nullMask) != 0;
   }
 
-  /// Retrieves rows from 'iterator' whose partition equals
-  /// 'partition'. Writes up to 'maxRows' pointers to the rows in
-  /// 'result'. Returns the number of rows retrieved, 0 when no more
-  /// rows are found. 'iterator' is expected to be in initial state
-  /// on first call.
+  /// Creates a container to store a partition number for each row in this row
+  /// container. This is used by the parallel join build, which is responsible
+  /// for filling it. This function also marks the row container as immutable;
+  /// we expect the user to call it only once.
+  std::unique_ptr<RowPartitions> createRowPartitions(memory::MemoryPool& pool);
+
+  /// Retrieves rows from 'iterator' whose partition equals 'partition'. Writes
+  /// up to 'maxRows' pointers to the rows in 'result'. 'rowPartitions' contains
+  /// the partition number of each row in this container. The function returns
+  /// the number of rows retrieved, 0 when no more rows are found. 'iterator' is
+  /// expected to be in initial state on first call.
   int32_t listPartitionRows(
       RowContainerIterator& iterator,
      uint8_t partition,
      int32_t maxRows,
+      const RowPartitions& rowPartitions,
      char* FOLLY_NONNULL* FOLLY_NONNULL result);
 
-  /// Returns a container with a partition number for each row. This
-  /// is created on first use. The caller is responsible for filling
-  /// this.
-  RowPartitions& partitions();
-
  /// Advances 'iterator' by 'numRows'. 
The current row after skip is /// in iter.currentRow(). This is null if past end. Public for testing. void skip(RowContainerIterator& iterator, int32_t numRows); + bool testingMutable() const { + return mutable_; + } + private: // Offset of the pointer to the next free row on a free row. static constexpr int32_t kNextFreeOffset = 0; @@ -758,8 +777,8 @@ class RowContainer { } *reinterpret_cast<T*>(row + offset) = decoded.valueAt<T>(index); if constexpr (std::is_same_v<T, StringView>) { - RowSizeTracker tracker(row[rowSizeOffset_], stringAllocator_); - stringAllocator_.copyMultipart(row, offset); + RowSizeTracker tracker(row[rowSizeOffset_], *stringAllocator_); + stringAllocator_->copyMultipart(row, offset); } } @@ -772,8 +791,8 @@ class RowContainer { using T = typename TypeTraits<Kind>::NativeType; *reinterpret_cast<T*>(group + offset) = decoded.valueAt<T>(index); if constexpr (std::is_same_v<T, StringView>) { - RowSizeTracker tracker(group[rowSizeOffset_], stringAllocator_); - stringAllocator_.copyMultipart(group, offset); + RowSizeTracker tracker(group[rowSizeOffset_], *stringAllocator_); + stringAllocator_->copyMultipart(group, offset); } } @@ -1085,8 +1104,16 @@ class RowContainer { // Free any aggregates associated with the 'rows'. void freeAggregates(folly::Range<char**> rows); + const bool checkFree_ = false; + const std::vector<TypePtr> keyTypes_; const bool nullableKeys_; + const bool isJoinBuild_; + + // Indicates if we can add a new row to this row container. It is set to + // false after the user calls 'createRowPartitions()' to create the + // 'RowPartitions' object for parallel join build. + bool mutable_{true}; std::vector<Accumulator> accumulators_; @@ -1095,7 +1122,6 @@ class RowContainer { // to 'typeKinds_' and 'rowColumns_'. std::vector<TypePtr> types_; std::vector<TypeKind> typeKinds_; - const bool isJoinBuild_; int32_t nextOffset_ = 0; // Bit position of null bit in the row. 0 if no null flag. Order is keys, // accumulators, dependent. @@ -1134,10 +1160,7 @@ class RowContainer { uint64_t numFreeRows_ = 0; memory::AllocationPool rows_; - HashStringAllocator stringAllocator_; - - // Partition number for each row. Used only in parallel hash join build. - std::unique_ptr<RowPartitions> partitions_; + std::shared_ptr<HashStringAllocator> stringAllocator_; int alignment_ = 1; }; diff --git a/velox/exec/RowNumber.cpp b/velox/exec/RowNumber.cpp index 98c7f12179a5..f6da535982d9 100644 --- a/velox/exec/RowNumber.cpp +++ b/velox/exec/RowNumber.cpp @@ -27,7 +27,8 @@ RowNumber::RowNumber( operatorId, rowNumberNode->id(), "RowNumber"), - limit_{rowNumberNode->limit()} { + limit_{rowNumberNode->limit()}, + generateRowNumber_{rowNumberNode->generateRowNumber()} { const auto& inputType = rowNumberNode->sources()[0]->outputType(); const auto& keys = rowNumberNode->partitionKeys(); const auto numKeys = keys.size(); @@ -53,8 +54,10 @@ RowNumber::RowNumber( identityProjections_.emplace_back(i, i); } - resultProjections_.emplace_back(0, inputType->size()); - results_.resize(1); + if (generateRowNumber_) { + resultProjections_.emplace_back(0, inputType->size()); + results_.resize(1); + } } void RowNumber::addInput(RowVectorPtr input) { @@ -104,8 +107,11 @@ RowVectorPtr RowNumber::getOutput() { rawMapping = mapping->asMutable<vector_size_t>(); } - // Compute row numbers. - auto& rowNumbers = getOrCreateRowNumberVector(numInput); + // Compute row numbers if needed.
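+ // When 'generateRowNumber_' is false the operator still maintains the + // per-partition counts needed to enforce 'limit_'; it only skips + // materializing the row_number output column.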
+ FlatVector<int64_t>* rowNumbers = nullptr; + if (generateRowNumber_) { + rowNumbers = &getOrCreateRowNumberVector(numInput); + } for (auto i = 0; i < numInput; ++i) { auto* partition = lookup_->hits[i]; @@ -119,7 +125,9 @@ RowVectorPtr RowNumber::getOutput() { rawMapping[index++] = i; } - rowNumbers.set(i, rowNumber); + if (generateRowNumber_) { + rowNumbers->set(i, rowNumber); + } setNumRows(partition, rowNumber); } @@ -155,10 +163,11 @@ RowVectorPtr RowNumber::getOutputForSinglePartition() { numOutput = numInput; } - auto& rowNumbers = getOrCreateRowNumberVector(numOutput); - - for (auto i = 0; i < numOutput; ++i) { - rowNumbers.set(i, ++numTotalInput_); + if (generateRowNumber_) { + auto& rowNumbers = getOrCreateRowNumberVector(numOutput); + for (auto i = 0; i < numOutput; ++i) { + rowNumbers.set(i, ++numTotalInput_); + } } auto output = fillOutput(numOutput, nullptr); diff --git a/velox/exec/RowNumber.h b/velox/exec/RowNumber.h index f42ef6f97441..7ecf963b69cf 100644 --- a/velox/exec/RowNumber.h +++ b/velox/exec/RowNumber.h @@ -53,6 +53,7 @@ class RowNumber : public Operator { FlatVector<int64_t>& getOrCreateRowNumberVector(vector_size_t size); const std::optional<int32_t> limit_; + const bool generateRowNumber_; /// Hash table to store number of rows seen so far per partition. Not used if /// there are no partitioning keys. diff --git a/velox/exec/SetAccumulator.h b/velox/exec/SetAccumulator.h index bb373ba06ab6..48d81bc2ef66 100644 --- a/velox/exec/SetAccumulator.h +++ b/velox/exec/SetAccumulator.h @@ -89,7 +89,10 @@ struct SetAccumulator { return index - offset; } - void free(HashStringAllocator& allocator) {} + void free(HashStringAllocator& allocator) { + using UT = decltype(uniqueValues); + uniqueValues.~UT(); + } }; /// Maintains a set of unique strings. @@ -146,6 +149,8 @@ struct StringViewSetAccumulator { void free(HashStringAllocator& allocator) { strings.free(allocator); + using Base = decltype(base); + base.~Base(); } }; @@ -214,6 +219,8 @@ struct ComplexTypeSetAccumulator { void free(HashStringAllocator& allocator) { values.free(allocator); + using Base = decltype(base); + base.~Base(); } }; diff --git a/velox/exec/SortedAggregations.cpp b/velox/exec/SortedAggregations.cpp index b20537d3bd84..3bc43b460232 100644 --- a/velox/exec/SortedAggregations.cpp +++ b/velox/exec/SortedAggregations.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "velox/exec/SortedAggregations.h" +#include "velox/common/base/RawVector.h" namespace facebook::velox::exec { @@ -270,6 +271,7 @@ void SortedAggregations::extractValues( const RowVectorPtr& result) { // TODO Identify aggregates with same order by and sort once. + raw_vector<vector_size_t> temp; SelectivityVector rows; for (auto i = 0; i < aggregates_.size(); ++i) { const auto& aggregate = *aggregates_[i]; @@ -300,6 +302,11 @@ void SortedAggregations::extractValues( // Release memory back to HashStringAllocator to allow next aggregate to // re-use it. aggregate.function->destroy(groups); + // Initialize empty groups over the destroyed ones to keep the container + // in a well-formed state. + aggregate.function->initializeNewGroups( + groups.data(), + folly::Range(iota(groups.size(), temp), groups.size())); } } diff --git a/velox/exec/Spill.cpp b/velox/exec/Spill.cpp index 67b19d0bcb9c..253199891951 100644 --- a/velox/exec/Spill.cpp +++ b/velox/exec/Spill.cpp @@ -27,7 +27,9 @@ namespace facebook::velox::exec { // nanosecond precision, we use this serde option to ensure the serializer // preserves precision.
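+ // Compression is explicitly disabled for spill files by passing + // CompressionKind_NONE below.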
static const serializer::presto::PrestoVectorSerde::PrestoOptions - kDefaultSerdeOptions(/*useLosslessTimestamp*/ true); + kDefaultSerdeOptions( + /*useLosslessTimestamp*/ true, + common::CompressionKind::CompressionKind_NONE); std::atomic SpillFile::ordinalCounter_; diff --git a/velox/exec/StreamingAggregation.cpp b/velox/exec/StreamingAggregation.cpp index 0a5561ec5918..4d160839f91b 100644 --- a/velox/exec/StreamingAggregation.cpp +++ b/velox/exec/StreamingAggregation.cpp @@ -123,11 +123,7 @@ StreamingAggregation::StreamingAggregation( } void StreamingAggregation::close() { - for (int32_t i = 0; i < aggregates_.size(); ++i) { - if (aggregates_[i]->accumulatorUsesExternalMemory()) { - aggregates_[i]->destroy(folly::Range(groups_.data(), groups_.size())); - } - } + rows_->clear(); Operator::close(); } diff --git a/velox/exec/TaskStats.h b/velox/exec/TaskStats.h index b6aa2298a884..ac64dd4d769f 100644 --- a/velox/exec/TaskStats.h +++ b/velox/exec/TaskStats.h @@ -88,9 +88,9 @@ struct TaskStats { /// Output buffer's memory utilization ratio measured as /// current buffer usage / max buffer size - double outputBufferUtilization; + double outputBufferUtilization{0}; /// Indicates if output buffer is over-utilized and thus blocks the producers. - bool outputBufferOverutilized; + bool outputBufferOverutilized{false}; }; } // namespace facebook::velox::exec diff --git a/velox/exec/tests/AggregationFuzzer.cpp b/velox/exec/tests/AggregationFuzzer.cpp index 2e86d9fc079d..4f25173ff016 100644 --- a/velox/exec/tests/AggregationFuzzer.cpp +++ b/velox/exec/tests/AggregationFuzzer.cpp @@ -64,6 +64,11 @@ DEFINE_string( "Directory path for persistence of data and SQL when fuzzer fails for " "future reproduction. Empty string disables this feature."); +DEFINE_bool( + enable_window_duck_verification, + false, + "When true, the results of the window aggregation will be compared to duckdb results"); + DEFINE_bool( persist_and_run_once, false, @@ -164,7 +169,8 @@ class AggregationFuzzer { const std::vector& sortingKeys, const std::vector& aggregates, const std::vector& input, - bool customVerification); + bool customVerification, + bool enableWindowDuckVerification); std::optional computeDuckWindow( const std::vector& partitionKeys, @@ -586,7 +592,12 @@ void AggregationFuzzer::go() { auto input = generateInputDataWithRowNumber(argNames, argTypes); verifyWindow( - partitionKeys, sortingKeys, {call}, input, customVerification); + partitionKeys, + sortingKeys, + {call}, + input, + customVerification, + FLAGS_enable_window_duck_verification); } else { // 20% of times use mask. 
std::vector<std::string> masks; @@ -958,7 +969,8 @@ void AggregationFuzzer::verifyWindow( const std::vector& sortingKeys, const std::vector& aggregates, const std::vector& input, - bool customVerification) { + bool customVerification, + bool enableWindowDuckVerification) { std::stringstream frame; if (!partitionKeys.empty()) { frame << "partition by " << folly::join(", ", partitionKeys); @@ -981,7 +993,8 @@ void AggregationFuzzer::verifyWindow( ++stats_.numFailed; } - if (!customVerification && resultOrError.result) { + if (!customVerification && resultOrError.result && + enableWindowDuckVerification) { if (auto expectedResult = computeDuckWindow( partitionKeys, sortingKeys, aggregates, input, plan)) { ++stats_.numDuckVerified; diff --git a/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp b/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp index eb70babd1c4c..af7e725a9db6 100644 --- a/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp +++ b/velox/exec/tests/PartitionedOutputBufferManagerTest.cpp @@ -149,22 +149,22 @@ class PartitionedOutputBufferManagerTest : public testing::Test { &receivedData]( std::vector<std::unique_ptr<folly::IOBuf>> pages, int64_t inSequence) { - EXPECT_FALSE(receivedData) << "for destination " << destination; - EXPECT_EQ(pages.size(), expectedGroups) + ASSERT_FALSE(receivedData) << "for destination " << destination; + ASSERT_EQ(pages.size(), expectedGroups) << "for destination " << destination; for (int i = 0; i < pages.size(); ++i) { if (i == pages.size() - 1) { - EXPECT_EQ(expectedEndMarker, pages[i] == nullptr) + ASSERT_EQ(expectedEndMarker, pages[i] == nullptr) << "for destination " << destination; } else { - EXPECT_TRUE(pages[i] != nullptr) + ASSERT_TRUE(pages[i] != nullptr) << "for destination " << destination; } } - EXPECT_EQ(inSequence, sequence) << "for destination " << destination; + ASSERT_EQ(inSequence, sequence) << "for destination " << destination; receivedData = true; })); - EXPECT_TRUE(receivedData) << "for destination " << destination; + ASSERT_TRUE(receivedData) << "for destination " << destination; } void fetchOne( @@ -562,20 +562,19 @@ TEST_F(PartitionedOutputBufferManagerTest, basicPartitioned) { taskId, rowType_, PartitionedOutputNode::Kind::kPartitioned, 5, 1); verifyOutputBuffer(task, OutputBufferStatus::kInitiated); - // Partitioned output buffer doesn't allow to update output buffers once - // created. - VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 + 1, true), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + // Duplicate updateOutputBuffers() calls with the same settings are allowed + // and ignored. + ASSERT_TRUE(bufferManager_->updateOutputBuffers(taskId, 5, true)); + ASSERT_FALSE(bufferManager_->isFinished(taskId)); + // Partitioned output buffer doesn't allow updates with a different number of + // output buffers once created. VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 + 1, false), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + bufferManager_->updateOutputBuffers(taskId, 5 + 1, true), ""); + // Partitioned output buffer doesn't expect more output buffers once created.
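+ // Repeating the same buffer count with noMoreBuffers == false below also + // throws: the buffer set of a partitioned output is fixed at creation time.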
+ VELOX_ASSERT_THROW(bufferManager_->updateOutputBuffers(taskId, 5, false), ""); VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 - 1, true), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + bufferManager_->updateOutputBuffers(taskId, 5 - 1, true), ""); VELOX_ASSERT_THROW( - bufferManager_->updateOutputBuffers(taskId, 5 - 1, false), - "updateOutputBuffers is not supported on PARTITIONED output buffer"); + bufferManager_->updateOutputBuffers(taskId, 5 - 1, false), ""); // - enqueue one group per destination // - fetch and ask one group per destination @@ -761,8 +760,10 @@ TEST_F(PartitionedOutputBufferManagerTest, basicArbitrary) { fetchOneAndAck(taskId, numDestinations - 1, 0); ackedSeqbyDestination[numDestinations - 1] = 1; - bufferManager_->updateOutputBuffers(taskId, numDestinations, true); - VELOX_ASSERT_THROW(fetchOneAndAck(taskId, numDestinations, 0), ""); + bufferManager_->updateOutputBuffers(taskId, numDestinations - 1, false); + VELOX_ASSERT_THROW( + fetchOneAndAck(taskId, numDestinations - 1, 0), + "(0 vs. 1) Get received for an already acknowledged item"); receivedData = false; registerForData(taskId, numDestinations - 2, 0, 1, receivedData); @@ -772,13 +773,18 @@ TEST_F(PartitionedOutputBufferManagerTest, basicArbitrary) { ackedSeqbyDestination[numDestinations - 2] = 1; noMoreData(taskId); + EXPECT_FALSE(bufferManager_->isFinished(taskId)); EXPECT_TRUE(task->isRunning()); for (int i = 0; i < numDestinations; ++i) { fetchEndMarker(taskId, i, ackedSeqbyDestination[i]); } EXPECT_TRUE(bufferManager_->isFinished(taskId)); - EXPECT_FALSE(task->isRunning()); + + // NOTE: the arbitrary buffer's finish condition doesn't depend on the + // no-more-(destination)-buffers update flag. + bufferManager_->updateOutputBuffers(taskId, numDestinations, true); + EXPECT_TRUE(bufferManager_->isFinished(taskId)); bufferManager_->removeTask(taskId); EXPECT_TRUE(task->isFinished()); @@ -919,8 +925,7 @@ TEST_P(AllPartitionedOutputBufferManagerTest, outputBufferUtilization) { const auto destination = 0; auto task = initializeTask(taskId, rowType_, kind_, 1, 1); verifyOutputBuffer(task, OutputBufferStatus::kInitiated); - if (kind_ != - facebook::velox::core::PartitionedOutputNode::Kind::kPartitioned) { + if (kind_ == facebook::velox::core::PartitionedOutputNode::Kind::kBroadcast) { bufferManager_->updateOutputBuffers(taskId, destination, true); } diff --git a/velox/exec/tests/PlanNodeSerdeTest.cpp b/velox/exec/tests/PlanNodeSerdeTest.cpp index 5c6243bb6d82..c8a9ce7a84f7 100644 --- a/velox/exec/tests/PlanNodeSerdeTest.cpp +++ b/velox/exec/tests/PlanNodeSerdeTest.cpp @@ -413,6 +413,7 @@ TEST_F(PlanNodeSerdeTest, window) { } TEST_F(PlanNodeSerdeTest, rowNumber) { + // Test with emitting the row number. auto plan = PlanBuilder().values({data_}).rowNumber({}).planNode(); testSerde(plan); @@ -420,6 +421,26 @@ TEST_F(PlanNodeSerdeTest, rowNumber) { testSerde(plan); plan = PlanBuilder().values({data_}).rowNumber({"c1", "c2"}, 10).planNode(); + testSerde(plan); + + // Test without emitting the row number.
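+ // The third argument to rowNumber() below is 'generateRowNumber'; see the + // PlanBuilder changes later in this diff.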
+ plan = PlanBuilder() + .values({data_}) + .rowNumber({}, std::nullopt, false) + .planNode(); + testSerde(plan); + + plan = PlanBuilder() + .values({data_}) + .rowNumber({"c2", "c0"}, std::nullopt, false) + .planNode(); + testSerde(plan); + + plan = PlanBuilder() + .values({data_}) + .rowNumber({"c1", "c2"}, 10, false) + .planNode(); + testSerde(plan); } TEST_F(PlanNodeSerdeTest, scan) { diff --git a/velox/exec/tests/PlanNodeToStringTest.cpp b/velox/exec/tests/PlanNodeToStringTest.cpp index cd6e34fc0ec2..baa7b4743181 100644 --- a/velox/exec/tests/PlanNodeToStringTest.cpp +++ b/velox/exec/tests/PlanNodeToStringTest.cpp @@ -701,6 +701,7 @@ TEST_F(PlanNodeToStringTest, window) { } TEST_F(PlanNodeToStringTest, rowNumber) { + // Emit row number. auto plan = PlanBuilder().tableScan(ROW({"a"}, {VARCHAR()})).rowNumber({}).planNode(); ASSERT_EQ("-- RowNumber\n", plan->toString()); ASSERT_EQ( "-- RowNumber[] -> a:VARCHAR, row_number:BIGINT\n", plan->toString(true, false)); + // Don't emit row number. + plan = PlanBuilder() + .tableScan(ROW({"a"}, {VARCHAR()})) + .rowNumber({}, std::nullopt, false) + .planNode(); + + ASSERT_EQ("-- RowNumber\n", plan->toString()); + ASSERT_EQ("-- RowNumber[] -> a:VARCHAR\n", plan->toString(true, false)); + + // Emit row number. plan = PlanBuilder() .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) .rowNumber({"a", "b"}) @@ -719,6 +730,18 @@ TEST_F(PlanNodeToStringTest, rowNumber) { "-- RowNumber[partition by (a, b)] -> a:BIGINT, b:VARCHAR, row_number:BIGINT\n", plan->toString(true, false)); + // Don't emit row number. + plan = PlanBuilder() + .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) + .rowNumber({"a", "b"}, std::nullopt, false) + .planNode(); + + ASSERT_EQ("-- RowNumber\n", plan->toString()); + ASSERT_EQ( + "-- RowNumber[partition by (a, b)] -> a:BIGINT, b:VARCHAR\n", + plan->toString(true, false)); + + // Emit row number. plan = PlanBuilder() .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) .rowNumber({"b"}, 10) @@ -728,6 +751,17 @@ TEST_F(PlanNodeToStringTest, rowNumber) { ASSERT_EQ( "-- RowNumber[partition by (b) limit 10] -> a:BIGINT, b:VARCHAR, row_number:BIGINT\n", plan->toString(true, false)); + + // Don't emit row number. + plan = PlanBuilder() + .tableScan(ROW({"a", "b"}, {BIGINT(), VARCHAR()})) + .rowNumber({"b"}, 10, false) + .planNode(); + + ASSERT_EQ("-- RowNumber\n", plan->toString()); + ASSERT_EQ( + "-- RowNumber[partition by (b) limit 10] -> a:BIGINT, b:VARCHAR\n", + plan->toString(true, false)); } TEST_F(PlanNodeToStringTest, topNRowNumber) { diff --git a/velox/exec/tests/RowContainerTest.cpp b/velox/exec/tests/RowContainerTest.cpp index 1697e9efdbe2..868ac9427325 100644 --- a/velox/exec/tests/RowContainerTest.cpp +++ b/velox/exec/tests/RowContainerTest.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "velox/common/base/tests/GTestUtils.h" #include "velox/exec/Aggregate.h" #include "velox/exec/VectorHasher.h" #include "velox/exec/tests/utils/RowContainerTestBase.h" @@ -957,8 +958,8 @@ TEST_F(RowContainerTest, compareDouble) { } TEST_F(RowContainerTest, partition) { - // We assign an arbitrary partition number to each row and iterate - // over the rows a partition at a time. + // We assign an arbitrary partition number to each row and iterate over the + // rows a partition at a time.
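+ // The flow exercised below: createRowPartitions() freezes the container, + // appendPartitions() records a partition number per row, and + // listPartitionRows() then takes the RowPartitions explicitly.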
constexpr int32_t kNumRows = 100019; constexpr uint8_t kNumPartitions = 16; auto batch = makeDataset( @@ -986,7 +987,32 @@ TEST_F(RowContainerTest, partition) { } } - auto& partitions = data->partitions(); + // Expect listPartitionRows() to throw before row partitions are created + // from this row container. + for (auto partition = 0; partition < kNumPartitions; ++partition) { + char* dummyBuffer; + RowPartitions dummyRowPartitions(data->numRows(), *pool_); + VELOX_ASSERT_THROW( + data->listPartitionRows( + iter, + partition, + 1'000, /* maxRows */ + dummyRowPartitions, + &dummyBuffer), + "Can't list partition rows from a mutable row container"); + } + + auto partitions = data->createRowPartitions(*pool_); + ASSERT_FALSE(data->testingMutable()); + // Verify we can only get row partitions once from a row container. + VELOX_ASSERT_THROW( + data->createRowPartitions(*pool_), + "Can only create RowPartitions once from a row container"); + // Verify we can't insert a new row into an immutable row container. +#ifndef NDEBUG + VELOX_ASSERT_THROW( + data->newRow(), "Can't add row into an immutable row container"); +#endif + std::vector<uint8_t> rowPartitions(kNumRows); // Assign a partition to each row based on modulo of first column. std::vector<std::vector<char*>> partitionRows(kNumPartitions); @@ -997,7 +1023,7 @@ TEST_F(RowContainerTest, partition) { rowPartitions[i] = partition; partitionRows[partition].push_back(rows[i]); } - partitions.appendPartitions( + partitions->appendPartitions( folly::Range(rowPartitions.data(), kNumRows)); for (auto partition = 0; partition < kNumPartitions; ++partition) { std::vector<char*> result(partitionRows[partition].size() + 10); @@ -1006,7 +1032,11 @@ TEST_F(RowContainerTest, partition) { int32_t resultBatch = 1; // Read the rows in multiple batches. while (auto numResults = data->listPartitionRows( - iter, partition, resultBatch, result.data() + numFound)) { + iter, + partition, + resultBatch, + *partitions, + result.data() + numFound)) { numFound += numResults; resultBatch += 13; } @@ -1016,6 +1046,17 @@ TEST_F(RowContainerTest, partition) { } } +TEST_F(RowContainerTest, partitionWithEmptyRowContainer) { + auto rowType = ROW( + {{"int_val", INTEGER()}, + {"long_val", BIGINT()}, + {"string_val", VARCHAR()}}); + auto rowContainer = + std::make_unique<RowContainer>(rowType->children(), pool_.get()); + auto partitions = rowContainer->createRowPartitions(*pool_); + ASSERT_EQ(partitions->size(), 0); +} + TEST_F(RowContainerTest, probedFlag) { auto rowContainer = std::make_unique<RowContainer>( std::vector<TypePtr>{BIGINT()}, // keyTypes diff --git a/velox/exec/tests/RowNumberTest.cpp b/velox/exec/tests/RowNumberTest.cpp index 700cd260e78e..22fee689601b 100644 --- a/velox/exec/tests/RowNumberTest.cpp +++ b/velox/exec/tests/RowNumberTest.cpp @@ -28,11 +28,21 @@ TEST_F(RowNumberTest, basic) { createDuckDbTable({data}); - // No limit. + // No limit, emit row numbers. auto plan = PlanBuilder().values({data}).rowNumber({"c0"}).planNode(); assertQuery(plan, "SELECT *, row_number() over (partition by c0) FROM tmp"); + // No limit, don't emit row numbers. + plan = PlanBuilder() + .values({data}) + .rowNumber({"c0"}, std::nullopt, false) + .planNode(); + assertQuery( + plan, + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp)"); + auto testLimit = [&](int32_t limit) { + // Limit, emit row numbers.
auto plan = PlanBuilder().values({data}).rowNumber({"c0"}, limit).planNode(); assertQuery( @@ -41,6 +51,16 @@ TEST_F(RowNumberTest, basic) { "SELECT * FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " "WHERE rn <= {}", limit)); + + // Limit, don't emit row numbers. + plan = + PlanBuilder().values({data}).rowNumber({"c0"}, limit, false).planNode(); + assertQuery( + plan, + fmt::format( + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " + "WHERE rn <= {}", + limit)); }; testLimit(1); @@ -55,11 +75,20 @@ TEST_F(RowNumberTest, noPartitionKeys) { createDuckDbTable({data, data}); - // No limit. + // No limit, emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({}).planNode(); assertQuery(plan, "SELECT *, row_number() over () FROM tmp"); + // No limit, don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({}, std::nullopt, false) + .planNode(); + assertQuery( + plan, "SELECT c0 FROM (SELECT *, row_number() over () as rn FROM tmp)"); + auto testLimit = [&](int32_t limit) { + // Emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({}, limit).planNode(); assertQuery( @@ -68,6 +97,18 @@ TEST_F(RowNumberTest, noPartitionKeys) { "SELECT * FROM (SELECT *, row_number() over () as rn FROM tmp) " "WHERE rn <= {}", limit)); + + // Don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({}, limit, false) + .planNode(); + assertQuery( + plan, + fmt::format( + "SELECT c0 FROM (SELECT *, row_number() over () as rn FROM tmp) " + "WHERE rn <= {}", + limit)); }; testLimit(1); @@ -82,11 +123,21 @@ TEST_F(RowNumberTest, largeInput) { createDuckDbTable({data, data}); - // No limit. + // No limit, emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({"c0"}).planNode(); assertQuery(plan, "SELECT *, row_number() over (partition by c0) FROM tmp"); + // No limit, don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({"c0"}, std::nullopt, false) + .planNode(); + assertQuery( + plan, + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp)"); + auto testLimit = [&](int32_t limit) { + // Emit row numbers. auto plan = PlanBuilder().values({data, data}).rowNumber({"c0"}, limit).planNode(); assertQuery( @@ -95,6 +146,18 @@ TEST_F(RowNumberTest, largeInput) { "SELECT * FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " "WHERE rn <= {}", limit)); + + // Don't emit row numbers. + plan = PlanBuilder() + .values({data, data}) + .rowNumber({"c0"}, limit, false) + .planNode(); + assertQuery( + plan, + fmt::format( + "SELECT c0, c1 FROM (SELECT *, row_number() over (partition by c0) as rn FROM tmp) " + "WHERE rn <= {}", + limit)); }; testLimit(1); diff --git a/velox/exec/tests/StreamingAggregationTest.cpp b/velox/exec/tests/StreamingAggregationTest.cpp index e880a9540a31..36858e523f87 100644 --- a/velox/exec/tests/StreamingAggregationTest.cpp +++ b/velox/exec/tests/StreamingAggregationTest.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/OperatorTestBase.h" #include "velox/exec/tests/utils/PlanBuilder.h" #include "velox/exec/tests/utils/SumNonPODAggregate.h" @@ -27,18 +28,6 @@ class StreamingAggregationTest : public OperatorTestBase { registerSumNonPODAggregate("sumnonpod", 64); } - CursorParameters makeCursorParameters( - const std::shared_ptr& planNode, - uint32_t preferredOutputBatchSize) { - CursorParameters params; - params.planNode = planNode; - params.queryCtx = std::make_shared(executor_.get()); - params.queryCtx->testingOverrideConfigUnsafe( - {{core::QueryConfig::kPreferredOutputBatchRows, - std::to_string(preferredOutputBatchSize)}}); - return params; - } - void testAggregation( const std::vector& keys, uint32_t outputBatchSize = 1'024) { @@ -67,11 +56,14 @@ class StreamingAggregationTest : public OperatorTestBase { .finalAggregation() .planNode(); - assertQuery( - makeCursorParameters(plan, outputBatchSize), - "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1)" - " , approx_quantile(c1, 0.95) " - "FROM tmp GROUP BY 1"); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults( + "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1)" + " , approx_quantile(c1, 0.95) " + "FROM tmp GROUP BY 1"); EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); @@ -85,9 +77,12 @@ class StreamingAggregationTest : public OperatorTestBase { .finalAggregation() .planNode(); - assertQuery( - makeCursorParameters(plan, outputBatchSize), - "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY 1"); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults( + "SELECT c0, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY 1"); EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); @@ -103,11 +98,14 @@ class StreamingAggregationTest : public OperatorTestBase { .finalAggregation() .planNode(); - assertQuery( - makeCursorParameters(plan, outputBatchSize), - "SELECT c0, count(1), min(c1) filter (where c1 % 7 = 0), " - "max(c1) filter (where c1 % 11 = 0), sum(c1) filter (where c1 % 7 = 0) " - "FROM tmp GROUP BY 1"); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults( + "SELECT c0, count(1), min(c1) filter (where c1 % 7 = 0), " + "max(c1) filter (where c1 % 11 = 0), sum(c1) filter (where c1 % 7 = 0) " + "FROM tmp GROUP BY 1"); } std::vector addPayload(const std::vector& keys) { @@ -168,12 +166,23 @@ class StreamingAggregationTest : public OperatorTestBase { keySql << ", c" << i; } - assertQuery( - makeCursorParameters(plan, outputBatchSize), - fmt::format( - "SELECT {}, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY {}", - keySql.str(), - keySql.str())); + const auto sql = fmt::format( + "SELECT {}, count(1), min(c1), max(c1), sum(c1), sum(1) FROM tmp GROUP BY {}", + keySql.str(), + keySql.str()); + + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config( + core::QueryConfig::kPreferredOutputBatchRows, + std::to_string(outputBatchSize)) + .assertResults(sql); + + EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); + + // Force partial aggregation flush after every batch of input. 
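+ // A zero kMaxPartialAggregationMemory budget makes every input batch exceed + // the partial aggregation limit, which forces a flush per batch.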
+ AssertQueryBuilder(plan, duckDbQueryRunner_) + .config(core::QueryConfig::kMaxPartialAggregationMemory, "0") + .assertResults(sql); EXPECT_EQ(NonPODInt64::constructed, NonPODInt64::destructed); } diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index e6df3e94b56c..78164dcdc3a8 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -2492,8 +2492,7 @@ TEST_F(TableScanTest, addSplitsToFailedTask) { } TEST_F(TableScanTest, errorInLoadLazy) { - auto cache = dynamic_cast( - memory::MemoryAllocator::getInstance()); + auto cache = cache::AsyncDataCache::getInstance(); VELOX_CHECK_NOT_NULL(cache); auto vectors = makeVectors(10, 1'000); auto filePath = TempFilePath::create(); diff --git a/velox/exec/tests/ThreadDebugInfoTest.cpp b/velox/exec/tests/ThreadDebugInfoTest.cpp index eea12376af1d..8721f5ee61ac 100644 --- a/velox/exec/tests/ThreadDebugInfoTest.cpp +++ b/velox/exec/tests/ThreadDebugInfoTest.cpp @@ -91,7 +91,7 @@ TEST_F(ThreadDebugInfoDeathTest, withinTheCallingThread) { executor_.get(), std::unordered_map{}, std::unordered_map>{}, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), nullptr, nullptr, "TaskCursorQuery_0"); diff --git a/velox/exec/tests/utils/AssertQueryBuilder.cpp b/velox/exec/tests/utils/AssertQueryBuilder.cpp index 74c52d72696b..0e3146eed0fb 100644 --- a/velox/exec/tests/utils/AssertQueryBuilder.cpp +++ b/velox/exec/tests/utils/AssertQueryBuilder.cpp @@ -216,7 +216,7 @@ AssertQueryBuilder::readCursor() { executor_.get(), std::unordered_map{}, std::unordered_map>{}, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), nullptr, nullptr, fmt::format("TaskCursorQuery_{}", cursorQueryId++)); diff --git a/velox/exec/tests/utils/Cursor.cpp b/velox/exec/tests/utils/Cursor.cpp index 59e0dad5f942..0495a15a3679 100644 --- a/velox/exec/tests/utils/Cursor.cpp +++ b/velox/exec/tests/utils/Cursor.cpp @@ -123,7 +123,7 @@ TaskCursor::TaskCursor(const CursorParameters& params) executor_.get(), std::unordered_map{}, std::unordered_map>{}, - memory::MemoryAllocator::getInstance(), + cache::AsyncDataCache::getInstance(), nullptr, nullptr, fmt::format("TaskCursorQuery_{}", cursorQueryId++)); diff --git a/velox/exec/tests/utils/OperatorTestBase.cpp b/velox/exec/tests/utils/OperatorTestBase.cpp index 8f378b70c361..a3f5e0ff4d06 100644 --- a/velox/exec/tests/utils/OperatorTestBase.cpp +++ b/velox/exec/tests/utils/OperatorTestBase.cpp @@ -33,9 +33,6 @@ using namespace facebook::velox::common::testutil; namespace facebook::velox::exec::test { -// static -std::shared_ptr OperatorTestBase::asyncDataCache_; - OperatorTestBase::OperatorTestBase() { using memory::MemoryAllocator; facebook::velox::exec::ExchangeSource::registerFactory(); @@ -51,29 +48,34 @@ OperatorTestBase::~OperatorTestBase() { memory::MemoryAllocator::setDefaultInstance(nullptr); } +void OperatorTestBase::SetUpTestCase() { + functions::prestosql::registerAllScalarFunctions(); + aggregate::prestosql::registerAllAggregateFunctions(); + TestValue::enable(); +} + void OperatorTestBase::TearDownTestCase() { Task::testingWaitForAllTasksToBeDeleted(); } void OperatorTestBase::SetUp() { - // Sets the process default MemoryAllocator to an async cache of up - // to 4GB backed by a default MemoryAllocator - if (!asyncDataCache_) { - asyncDataCache_ = std::make_shared( - memory::MemoryAllocator::createDefaultInstance(), 4UL << 30); - } - memory::MemoryAllocator::setDefaultInstance(asyncDataCache_.get()); if 
(!isRegisteredVectorSerde()) { this->registerVectorSerde(); } driverExecutor_ = std::make_unique(3); ioExecutor_ = std::make_unique(3); + allocator_ = memory::MemoryAllocator::createDefaultInstance(); + if (!asyncDataCache_) { + asyncDataCache_ = cache::AsyncDataCache::create(allocator_.get()); + cache::AsyncDataCache::setInstance(asyncDataCache_.get()); + } + memory::MemoryAllocator::setDefaultInstance(allocator_.get()); } -void OperatorTestBase::SetUpTestCase() { - functions::prestosql::registerAllScalarFunctions(); - aggregate::prestosql::registerAllAggregateFunctions(); - TestValue::enable(); +void OperatorTestBase::TearDown() { + if (asyncDataCache_ != nullptr) { + asyncDataCache_->prepareShutdown(); + } } std::shared_ptr OperatorTestBase::assertQuery( diff --git a/velox/exec/tests/utils/OperatorTestBase.h b/velox/exec/tests/utils/OperatorTestBase.h index e5a40d3f8f4e..7900b94f95aa 100644 --- a/velox/exec/tests/utils/OperatorTestBase.h +++ b/velox/exec/tests/utils/OperatorTestBase.h @@ -37,6 +37,8 @@ class OperatorTestBase : public testing::Test, void SetUp() override; + void TearDown() override; + /// Allow base classes to register custom vector serde. /// By default, registers Presto-compatible serde. virtual void registerVectorSerde(); @@ -139,8 +141,11 @@ class OperatorTestBase : public testing::Test, protected: DuckDbQueryRunner duckDbQueryRunner_; - // Used as default MappedMemory. Created on first use. - static std::shared_ptr asyncDataCache_; + // Used as default MemoryAllocator. + std::shared_ptr allocator_; + + // Used as default AsyncDataCache. + std::shared_ptr asyncDataCache_; // Used for driver thread execution. std::unique_ptr driverExecutor_; diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 39c94faa9ef7..0f3ce0418805 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -848,9 +848,9 @@ PlanBuilder& PlanBuilder::partitionedOutput( : extract(planNode_->outputType(), outputLayout); planNode_ = std::make_shared( nextPlanNodeId(), + core::PartitionedOutputNode::Kind::kPartitioned, exprs(keys), numPartitions, - core::PartitionedOutputNode::Kind::kPartitioned, replicateNullsAndAny, std::move(partitionFunctionSpec), outputType, @@ -1424,9 +1424,18 @@ PlanBuilder& PlanBuilder::window( PlanBuilder& PlanBuilder::rowNumber( const std::vector& partitionKeys, - std::optional limit) { + std::optional limit, + const bool generateRowNumber) { + std::optional rowNumberColumnName; + if (generateRowNumber) { + rowNumberColumnName = "row_number"; + } planNode_ = std::make_shared( - nextPlanNodeId(), fields(partitionKeys), "row_number", limit, planNode_); + nextPlanNodeId(), + fields(partitionKeys), + rowNumberColumnName, + limit, + planNode_); return *this; } diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index b12b812ca3d1..99b347799f80 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -722,7 +722,8 @@ class PlanBuilder { /// optional limit and no sorting. PlanBuilder& rowNumber( const std::vector& partitionKeys, - std::optional limit = std::nullopt); + std::optional limit = std::nullopt, + bool generateRowNumber = true); /// Add a TopNRowNumberNode to compute single row_number window function with /// a limit applied to sorted partitions. 
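For reference, a minimal sketch of the updated builder call (the 'data' setup is assumed, not part of this diff): a RowNumber node that enforces a per-partition limit without materializing the row_number column:

    auto plan = PlanBuilder()
        .values({data})
        .rowNumber({"c0"}, /*limit=*/10, /*generateRowNumber=*/false)
        .planNode();

When generateRowNumber is false, PlanBuilder passes std::nullopt for the row number column name, which RowNumberNode::generateRowNumber() then reports to the operator.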
diff --git a/velox/exec/tests/utils/RowContainerTestBase.h b/velox/exec/tests/utils/RowContainerTestBase.h index 1b6f5940060a..2d70c5fbf6cf 100644 --- a/velox/exec/tests/utils/RowContainerTestBase.h +++ b/velox/exec/tests/utils/RowContainerTestBase.h @@ -54,7 +54,7 @@ class RowContainerTestBase : public testing::Test, const std::vector& keyTypes, const std::vector& dependentTypes, bool isJoinBuild = true) { - return std::make_unique( + auto container = std::make_unique( keyTypes, !isJoinBuild, std::vector{}, @@ -64,6 +64,8 @@ class RowContainerTestBase : public testing::Test, true, true, pool_.get()); + VELOX_CHECK(container->testingMutable()); + return container; } }; } // namespace facebook::velox::exec::test diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h new file mode 100644 index 000000000000..6883aee47a0b --- /dev/null +++ b/velox/expression/CastExpr-inl.h @@ -0,0 +1,322 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "velox/common/base/Exceptions.h" +#include "velox/core/CoreTypeSystem.h" +#include "velox/expression/StringWriter.h" +#include "velox/external/date/tz.h" +#include "velox/type/Type.h" +#include "velox/vector/SelectivityVector.h" + +namespace facebook::velox::exec { +namespace { + +inline std::string makeErrorMessage( + const BaseVector& input, + vector_size_t row, + const TypePtr& toType, + const std::string& details = "") { + return fmt::format( + "Failed to cast from {} to {}: {}. {}", + input.type()->toString(), + toType->toString(), + input.toString(row), + details); +} + +inline std::exception_ptr makeBadCastException( + const TypePtr& resultType, + const BaseVector& input, + vector_size_t row, + const std::string& errorDetails) { + return std::make_exception_ptr(VeloxUserError( + std::current_exception(), + makeErrorMessage(input, row, resultType, errorDetails), + false)); +}; + +} // namespace + +template +void CastExpr::castTimestampToDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + VectorPtr& result, + const date::time_zone* timeZone) { + auto* resultFlatVector = result->as>(); + static const int32_t kSecsPerDay{86'400}; + auto inputVector = input.as>(); + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + auto input = inputVector->valueAt(row); + if constexpr (adjustForTimeZone) { + input.toTimezone(*timeZone); + } + auto seconds = input.getSeconds(); + if (seconds >= 0 || seconds % kSecsPerDay == 0) { + resultFlatVector->set(row, seconds / kSecsPerDay); + } else { + // For division with negatives, minus 1 to compensate the discarded + // fractional part. e.g. -1/86'400 yields 0, yet it should be + // considered as -1 day. 
+ resultFlatVector->set(row, seconds / kSecsPerDay - 1); + } + }); +} + +template +void CastExpr::applyToSelectedNoThrowLocal( + EvalCtx& context, + const SelectivityVector& rows, + VectorPtr& result, + Func&& func) { + if (setNullInResultAtError()) { + rows.template applyToSelected([&](auto row) INLINE_LAMBDA { + try { + func(row); + } catch (...) { + result->setNull(row, true); + } + }); + } else { + rows.template applyToSelected([&](auto row) INLINE_LAMBDA { + try { + func(row); + } catch (const VeloxException& e) { + // Avoid double throwing. + context.setVeloxExceptionError(row, std::current_exception()); + } catch (const std::exception& e) { + context.setError(row, std::current_exception()); + } + }); + } +} + +/// The per-row level Kernel +/// @tparam ToKind The cast target type +/// @tparam FromKind The expression type +/// @param row The index of the current row +/// @param input The input vector (of type FromKind) +/// @param result The output vector (of type ToKind) +template +void CastExpr::applyCastKernel( + vector_size_t row, + EvalCtx& context, + const SimpleVector::NativeType>* input, + FlatVector::NativeType>* result) { + auto inputRowValue = input->valueAt(row); + + // Optimize empty input strings casting by avoiding throwing exceptions. + if constexpr ( + FromKind == TypeKind::VARCHAR || FromKind == TypeKind::VARBINARY) { + if constexpr ( + TypeTraits::isPrimitiveType && + TypeTraits::isFixedWidth) { + if (inputRowValue.size() == 0) { + if (setNullInResultAtError()) { + result->setNull(row, true); + } else { + context.setVeloxExceptionError( + row, + makeBadCastException( + result->type(), *input, row, "Empty string")); + } + return; + } + } + } + + auto output = util::Converter::cast(inputRowValue); + + if constexpr (ToKind == TypeKind::VARCHAR || ToKind == TypeKind::VARBINARY) { + // Write the result output to the output vector + auto writer = exec::StringWriter<>(result, row); + writer.copy_from(output); + writer.finalize(); + } else { + result->set(row, output); + } +} + +template +void CastExpr::applyDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType, + const TypePtr& toType, + VectorPtr& castResult) { + auto sourceVector = input.as>(); + auto castResultRawBuffer = + castResult->asUnchecked>()->mutableRawValues(); + const auto& fromPrecisionScale = getDecimalPrecisionScale(*fromType); + const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); + + applyToSelectedNoThrowLocal( + context, rows, castResult, [&](vector_size_t row) { + auto rescaledValue = DecimalUtil::rescaleWithRoundUp( + sourceVector->valueAt(row), + fromPrecisionScale.first, + fromPrecisionScale.second, + toPrecisionScale.first, + toPrecisionScale.second); + if (rescaledValue.has_value()) { + castResultRawBuffer[row] = rescaledValue.value(); + } else { + castResult->setNull(row, true); + } + }); +} + +template +void CastExpr::applyIntToDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& toType, + VectorPtr& castResult) { + auto sourceVector = input.as>(); + auto castResultRawBuffer = + castResult->asUnchecked>()->mutableRawValues(); + const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); + applyToSelectedNoThrowLocal( + context, rows, castResult, [&](vector_size_t row) { + auto rescaledValue = DecimalUtil::rescaleInt( + sourceVector->valueAt(row), + toPrecisionScale.first, + toPrecisionScale.second); + if (rescaledValue.has_value()) 
{ + castResultRawBuffer[row] = rescaledValue.value(); + } else { + castResult->setNull(row, true); + } + }); +} + +template +VectorPtr CastExpr::applyDecimalToDoubleCast( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType) { + VectorPtr result; + context.ensureWritable(rows, DOUBLE(), result); + (*result).clearNulls(rows); + auto resultBuffer = + result->asUnchecked>()->mutableRawValues(); + const auto precisionScale = getDecimalPrecisionScale(*fromType); + const auto simpleInput = input.as>(); + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + auto output = util::Converter::cast( + simpleInput->valueAt(row)); + resultBuffer[row] = + output / DecimalUtil::kPowersOfTen[precisionScale.second]; + }); + + return result; +} + +template +void CastExpr::applyCastPrimitives( + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result) { + using To = typename TypeTraits::NativeType; + using From = typename TypeTraits::NativeType; + auto* resultFlatVector = result->as>(); + auto* inputSimpleVector = input.as>(); + + const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); + auto& resultType = resultFlatVector->type(); + + auto setError = [&](vector_size_t row, const std::string& details) { + if (setNullInResultAtError()) { + result->setNull(row, true); + } else { + context.setVeloxExceptionError( + row, makeBadCastException(resultType, input, row, details)); + } + }; + + if (!queryConfig.isCastToIntByTruncate()) { + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + try { + applyCastKernel( + row, context, inputSimpleVector, resultFlatVector); + + } catch (const VeloxUserError& ue) { + setError(row, ue.message()); + } catch (const std::exception& e) { + setError(row, e.what()); + } + }); + + } else { + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + try { + applyCastKernel( + row, context, inputSimpleVector, resultFlatVector); + } catch (const VeloxUserError& ue) { + setError(row, ue.message()); + } catch (const std::exception& e) { + setError(row, e.what()); + } + }); + } + + // If we're converting to a TIMESTAMP, check if we need to adjust the + // current GMT timezone to the user provided session timezone. + if constexpr (ToKind == TypeKind::TIMESTAMP) { + // If user explicitly asked us to adjust the timezone. + if (queryConfig.adjustTimestampToTimezone()) { + auto sessionTzName = queryConfig.sessionTimezone(); + if (!sessionTzName.empty()) { + // locate_zone throws runtime_error if the timezone couldn't be found + // (so we're safe to dereference the pointer). + auto* timeZone = date::locate_zone(sessionTzName); + auto rawTimestamps = resultFlatVector->mutableRawValues(); + + applyToSelectedNoThrowLocal(context, rows, result, [&](int row) { + rawTimestamps[row].toGMT(*timeZone); + }); + } + } + } +} + +template +void CastExpr::applyCastPrimitivesDispatch( + const TypePtr& fromType, + const TypePtr& toType, + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result) { + context.ensureWritable(rows, toType, result); + + // This already excludes complex types, hugeint and unknown from type kinds. 
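+ // The dispatch macro switches on 'fromType->kind()' and invokes + // applyCastPrimitives<ToKind, FromKind> for the matching scalar source kind.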
+ VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( + applyCastPrimitives, + ToKind, + fromType->kind() /*dispatched*/, + rows, + context, + input, + result); +} + +} // namespace facebook::velox::exec diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp index 2ff3ceb5b187..68ad19ab4206 100644 --- a/velox/expression/CastExpr.cpp +++ b/velox/expression/CastExpr.cpp @@ -16,15 +16,13 @@ #include "velox/expression/CastExpr.h" -#include - #include +#include -#include #include "velox/common/base/Exceptions.h" #include "velox/core/CoreTypeSystem.h" #include "velox/expression/PeeledEncoding.h" -#include "velox/expression/StringWriter.h" +#include "velox/expression/ScopedVarSetter.h" #include "velox/external/date/tz.h" #include "velox/functions/lib/RowsTranslationUtil.h" #include "velox/type/Type.h" @@ -34,74 +32,7 @@ namespace facebook::velox::exec { -namespace { - -std::string makeErrorMessage( - const BaseVector& input, - vector_size_t row, - const TypePtr& toType, - const std::string& details = "") { - return fmt::format( - "Failed to cast from {} to {}: {}. {}", - input.type()->toString(), - toType->toString(), - input.toString(row), - details); -} - -std::exception_ptr makeBadCastException( - const TypePtr& resultType, - const BaseVector& input, - vector_size_t row, - const std::string& errorDetails) { - return std::make_exception_ptr(VeloxUserError( - std::current_exception(), - makeErrorMessage(input, row, resultType, errorDetails), - false)); -}; - -/// The per-row level Kernel -/// @tparam ToKind The cast target type -/// @tparam FromKind The expression type -/// @param row The index of the current row -/// @param input The input vector (of type FromKind) -/// @param result The output vector (of type ToKind) -template -void applyCastKernel( - vector_size_t row, - EvalCtx& context, - const SimpleVector::NativeType>* input, - FlatVector::NativeType>* result) { - auto inputRowValue = input->valueAt(row); - - // Optimize empty input strings casting by avoiding throwing exceptions. 
- if constexpr ( - FromKind == TypeKind::VARCHAR || FromKind == TypeKind::VARBINARY) { - if constexpr ( - TypeTraits::isPrimitiveType && - TypeTraits::isFixedWidth) { - if (inputRowValue.size() == 0) { - context.setVeloxExceptionError( - row, - makeBadCastException(result->type(), *input, row, "Empty string")); - return; - } - } - } - - auto output = util::Converter::cast(inputRowValue); - - if constexpr (ToKind == TypeKind::VARCHAR || ToKind == TypeKind::VARBINARY) { - // Write the result output to the output vector - auto writer = exec::StringWriter<>(result, row); - writer.copy_from(output); - writer.finalize(); - } else { - result->set(row, output); - } -} - -VectorPtr castFromDate( +VectorPtr CastExpr::castFromDate( const SelectivityVector& rows, const BaseVector& input, exec::EvalCtx& context, @@ -114,7 +45,7 @@ VectorPtr castFromDate( switch (toType->kind()) { case TypeKind::VARCHAR: { auto* resultFlatVector = castResult->as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { + applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { try { auto output = DATE()->toString(inputFlatVector->valueAt(row)); auto writer = exec::StringWriter<>(resultFlatVector, row); @@ -134,12 +65,13 @@ VectorPtr castFromDate( case TypeKind::TIMESTAMP: { static const int64_t kMillisPerDay{86'400'000}; auto* resultFlatVector = castResult->as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { + applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { resultFlatVector->set( row, Timestamp::fromMillis( inputFlatVector->valueAt(row) * kMillisPerDay)); }); + return castResult; } default: @@ -148,33 +80,7 @@ VectorPtr castFromDate( } } -template -void castTimestampToDate( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - FlatVector* resultFlatVector, - const date::time_zone* timeZone = nullptr) { - static const int32_t kSecsPerDay{86'400}; - auto inputVector = input.as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { - auto input = inputVector->valueAt(row); - if constexpr (adjustForTimeZone) { - input.toTimezone(*timeZone); - } - auto seconds = input.getSeconds(); - if (seconds >= 0 || seconds % kSecsPerDay == 0) { - resultFlatVector->set(row, seconds / kSecsPerDay); - } else { - // For division with negatives, minus 1 to compensate the discarded - // fractional part. e.g. -1/86'400 yields 0, yet it should be - // considered as -1 day. 
- resultFlatVector->set(row, seconds / kSecsPerDay - 1); - } - }); -} - -VectorPtr castToDate( +VectorPtr CastExpr::castToDate( const SelectivityVector& rows, const BaseVector& input, exec::EvalCtx& context, @@ -186,7 +92,7 @@ VectorPtr castToDate( switch (fromType->kind()) { case TypeKind::VARCHAR: { auto* inputVector = input.as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { + applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { try { auto inputString = inputVector->valueAt(row); resultFlatVector->set(row, DATE()->toDays(inputString)); @@ -198,6 +104,7 @@ VectorPtr castToDate( makeErrorMessage(input, row, DATE()) + " " + e.what()); } }); + return castResult; } case TypeKind::TIMESTAMP: { @@ -205,10 +112,9 @@ VectorPtr castToDate( auto sessionTzName = queryConfig.sessionTimezone(); if (queryConfig.adjustTimestampToTimezone() && !sessionTzName.empty()) { auto* timeZone = date::locate_zone(sessionTzName); - castTimestampToDate( - rows, input, context, resultFlatVector, timeZone); + castTimestampToDate(rows, input, context, castResult, timeZone); } else { - castTimestampToDate(rows, input, context, resultFlatVector); + castTimestampToDate(rows, input, context, castResult); } return castResult; } @@ -218,169 +124,26 @@ VectorPtr castToDate( } } -template -void applyDecimalCastKernel( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - const TypePtr& fromType, - const TypePtr& toType, - VectorPtr& castResult) { - auto sourceVector = input.as>(); - auto castResultRawBuffer = - castResult->asUnchecked>()->mutableRawValues(); - const auto& fromPrecisionScale = getDecimalPrecisionScale(*fromType); - const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); - context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { - auto rescaledValue = DecimalUtil::rescaleWithRoundUp( - sourceVector->valueAt(row), - fromPrecisionScale.first, - fromPrecisionScale.second, - toPrecisionScale.first, - toPrecisionScale.second); - if (rescaledValue.has_value()) { - castResultRawBuffer[row] = rescaledValue.value(); - } else { - castResult->setNull(row, true); - } - }); -} - -template -void applyIntToDecimalCastKernel( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - const TypePtr& toType, - VectorPtr& castResult) { - auto sourceVector = input.as>(); - auto castResultRawBuffer = - castResult->asUnchecked>()->mutableRawValues(); - const auto& toPrecisionScale = getDecimalPrecisionScale(*toType); - context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { - auto rescaledValue = DecimalUtil::rescaleInt( - sourceVector->valueAt(row), - toPrecisionScale.first, - toPrecisionScale.second); - if (rescaledValue.has_value()) { - castResultRawBuffer[row] = rescaledValue.value(); +namespace { +void propagateErrorsOrSetNulls( + bool setNullInResultAtError, + EvalCtx& context, + const SelectivityVector& nestedRows, + const BufferPtr& elementToTopLevelRows, + VectorPtr& result, + ErrorVectorPtr& oldErrors) { + if (context.errors()) { + if (setNullInResultAtError) { + // Errors in context.errors() should be translated to nulls in the top + // level rows. 
+ context.convertElementErrorsToTopLevelNulls( + nestedRows, elementToTopLevelRows, result); } else { - castResult->setNull(row, true); - } - }); -} - -template -VectorPtr applyDecimalToDoubleCast( - const SelectivityVector& rows, - const BaseVector& input, - exec::EvalCtx& context, - const TypePtr& fromType) { - VectorPtr result; - context.ensureWritable(rows, DOUBLE(), result); - (*result).clearNulls(rows); - auto resultBuffer = - result->asUnchecked>()->mutableRawValues(); - const auto precisionScale = getDecimalPrecisionScale(*fromType); - const auto simpleInput = input.as>(); - context.applyToSelectedNoThrow(rows, [&](int row) { - auto output = util::Converter::cast( - simpleInput->valueAt(row)); - resultBuffer[row] = - output / DecimalUtil::kPowersOfTen[precisionScale.second]; - }); - return result; -} - -template -void applyCastPrimitives( - const SelectivityVector& rows, - exec::EvalCtx& context, - const BaseVector& input, - VectorPtr& result) { - using To = typename TypeTraits::NativeType; - using From = typename TypeTraits::NativeType; - auto* resultFlatVector = result->as>(); - auto* inputSimpleVector = input.as>(); - - const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); - auto& resultType = resultFlatVector->type(); - - auto setVeloxError = [&](vector_size_t row, const std::string& details) { - context.setVeloxExceptionError( - row, makeBadCastException(resultType, input, row, details)); - }; - - auto setError = [&](vector_size_t row, const std::string& details) { - context.setError( - row, makeBadCastException(resultType, input, row, details)); - }; - - if (!queryConfig.isCastToIntByTruncate()) { - context.applyToSelectedNoThrow(rows, [&](int row) { - try { - applyCastKernel( - row, context, inputSimpleVector, resultFlatVector); - - } catch (const VeloxUserError& ue) { - setVeloxError(row, ue.message()); - } catch (const std::exception& e) { - setError(row, e.what()); - } - }); - } else { - context.applyToSelectedNoThrow(rows, [&](int row) { - try { - applyCastKernel( - row, context, inputSimpleVector, resultFlatVector); - } catch (const VeloxUserError& ue) { - setVeloxError(row, ue.message()); - } catch (const std::exception& e) { - setError(row, e.what()); - } - }); - } - - // If we're converting to a TIMESTAMP, check if we need to adjust the - // current GMT timezone to the user provided session timezone. - if constexpr (ToKind == TypeKind::TIMESTAMP) { - // If user explicitly asked us to adjust the timezone. - if (queryConfig.adjustTimestampToTimezone()) { - auto sessionTzName = queryConfig.sessionTimezone(); - if (!sessionTzName.empty()) { - // locate_zone throws runtime_error if the timezone couldn't be found - // (so we're safe to dereference the pointer). - auto* timeZone = date::locate_zone(sessionTzName); - auto rawTimestamps = resultFlatVector->mutableRawValues(); - - rows.applyToSelected( - [&](int row) { rawTimestamps[row].toGMT(*timeZone); }); - } + context.addElementErrorsToTopLevel( + nestedRows, elementToTopLevelRows, oldErrors); } } } - -template -void applyCastPrimitivesDispatch( - const TypePtr& fromType, - const TypePtr& toType, - const SelectivityVector& rows, - exec::EvalCtx& context, - const BaseVector& input, - VectorPtr& result) { - context.ensureWritable(rows, toType, result); - - // This already excludes complex types, hugeint and unknown from type kinds. 
- VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( - applyCastPrimitives, - ToKind, - fromType->kind() /*dispatched*/, - rows, - context, - input, - result); -} - } // namespace VectorPtr CastExpr::applyMap( @@ -413,13 +176,16 @@ VectorPtr CastExpr::applyMap( if (fromType.keyType() == toType.keyType()) { newMapKeys = input->mapKeys(); } else { - apply( - nestedRows, - mapKeys, - context, - fromType.keyType(), - toType.keyType(), - newMapKeys); + { + ScopedVarSetter holder(&inTopLevel, false); + apply( + nestedRows, + mapKeys, + context, + fromType.keyType(), + toType.keyType(), + newMapKeys); + } } // Cast values @@ -427,19 +193,18 @@ VectorPtr CastExpr::applyMap( if (fromType.valueType() == toType.valueType()) { newMapValues = mapValues; } else { - apply( - nestedRows, - mapValues, - context, - fromType.valueType(), - toType.valueType(), - newMapValues); + { + ScopedVarSetter holder(&inTopLevel, false); + apply( + nestedRows, + mapValues, + context, + fromType.valueType(), + toType.valueType(), + newMapValues); + } } - context.addElementErrorsToTopLevel( - nestedRows, elementToTopLevelRows, oldErrors); - context.swapErrors(oldErrors); - // Returned map vector should be addressable for every element, even those // that are not selected. BufferPtr sizes = input->sizes(); @@ -447,7 +212,6 @@ VectorPtr CastExpr::applyMap( // We extends size since that is cheap. newMapKeys->resize(input->mapKeys()->size()); newMapValues->resize(input->mapValues()->size()); - } else if ( newMapKeys->size() < input->mapKeys()->size() || newMapValues->size() < input->mapValues()->size()) { @@ -455,12 +219,13 @@ VectorPtr CastExpr::applyMap( AlignedBuffer::allocate(rows.end(), context.pool(), 0); auto* inputSizes = input->rawSizes(); auto* rawSizes = sizes->asMutable(); + rows.applyToSelected( [&](vector_size_t row) { rawSizes[row] = inputSizes[row]; }); } // Assemble the output map - return std::make_shared( + VectorPtr result = std::make_shared( context.pool(), MAP(toType.keyType(), toType.valueType()), input->nulls(), @@ -469,6 +234,18 @@ VectorPtr CastExpr::applyMap( sizes, newMapKeys, newMapValues); + + propagateErrorsOrSetNulls( + setNullInResultAtError(), + context, + nestedRows, + elementToTopLevelRows, + result, + oldErrors); + + // Restore original state. + context.swapErrors(oldErrors); + return result; } VectorPtr CastExpr::applyArray( @@ -490,19 +267,16 @@ VectorPtr CastExpr::applyArray( context.swapErrors(oldErrors); VectorPtr newElements; - apply( - nestedRows, - arrayElements, - context, - fromType.elementType(), - toType.elementType(), - newElements); - - if (context.errors()) { - context.addElementErrorsToTopLevel( - nestedRows, elementToTopLevelRows, oldErrors); + { + ScopedVarSetter holder(&inTopLevel, false); + apply( + nestedRows, + arrayElements, + context, + fromType.elementType(), + toType.elementType(), + newElements); } - context.swapErrors(oldErrors); // Returned array vector should be addressable for every element, even those // that are not selected. @@ -519,7 +293,7 @@ VectorPtr CastExpr::applyArray( [&](vector_size_t row) { rawSizes[row] = inputSizes[row]; }); } - return std::make_shared( + VectorPtr result = std::make_shared( context.pool(), ARRAY(toType.elementType()), input->nulls(), @@ -527,6 +301,17 @@ VectorPtr CastExpr::applyArray( input->offsets(), sizes, newElements); + + propagateErrorsOrSetNulls( + setNullInResultAtError(), + context, + nestedRows, + elementToTopLevelRows, + result, + oldErrors); + // Restore original state. 
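+  // 'oldErrors' holds the errors that existed before this cast ran; swapping
+  // them back ensures pre-existing errors are neither lost nor mistaken for
+  // cast failures.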
+  context.swapErrors(oldErrors);
+  return result;
 }
 
 VectorPtr CastExpr::applyRow(
@@ -548,6 +333,13 @@ VectorPtr CastExpr::applyRow(
   std::vector<VectorPtr> newChildren;
   newChildren.reserve(numOutputChildren);
 
+  ErrorVectorPtr oldErrors;
+  if (setNullInResultAtError()) {
+    // We need to isolate errors that happen during the cast from previous
+    // errors since those translate to nulls, unlike existing errors.
+    context.swapErrors(oldErrors);
+  }
+
   for (auto toChildrenIndex = 0; toChildrenIndex < numOutputChildren;
        toChildrenIndex++) {
     // For each child, find the corresponding column index in the output
@@ -584,7 +376,8 @@
       if (toChildType == inputChild->type()) {
         outputChild = inputChild;
       } else {
-        // Apply cast for the child
+        // Apply cast for the child.
+        ScopedVarSetter holder(&inTopLevel, false);
         apply(
             rows,
             inputChild,
@@ -598,12 +391,27 @@
   }
 
   // Assemble the output row
-  return std::make_shared<RowVector>(
+  VectorPtr result = std::make_shared<RowVector>(
      context.pool(),
      toType,
      input->nulls(),
      rows.end(),
      std::move(newChildren));
+
+  if (setNullInResultAtError()) {
+    // Set errors as nulls.
+    if (auto errors = context.errors()) {
+      rows.applyToSelected([&](auto row) {
+        if (errors->isIndexInRange(row) && !errors->isNullAt(row)) {
+          result->setNull(row, true);
+        }
+      });
+    }
+    // Restore original state.
+    context.swapErrors(oldErrors);
+  }
+
+  return result;
 }
 
 template
@@ -797,14 +605,20 @@ void CastExpr::evalSpecialForm(
   auto fromType = inputs_[0]->type();
   auto toType = std::const_pointer_cast(type_);
 
-  apply(rows, input, context, fromType, toType, result);
+  inTopLevel = true;
+  if (nullOnFailure()) {
+    ScopedVarSetter holder{context.mutableThrowOnError(), false};
+    apply(rows, input, context, fromType, toType, result);
+  } else {
+    apply(rows, input, context, fromType, toType, result);
+  }
 
   // Return 'input' back to the vector pool in 'context' so it can be reused.
context.releaseVector(input); } std::string CastExpr::toString(bool recursive) const { std::stringstream out; - out << "cast("; + out << name() << "("; if (recursive) { appendInputs(out); } else { @@ -816,7 +630,7 @@ std::string CastExpr::toString(bool recursive) const { std::string CastExpr::toSql(std::vector* complexConstants) const { std::stringstream out; - out << "cast("; + out << name() << "("; appendInputsSql(out, complexConstants); out << " as "; toTypeSql(type_, out); @@ -839,6 +653,24 @@ ExprPtr CastCallToSpecialForm::constructSpecialForm( "CAST statements expect exactly 1 argument, received {}", compiledChildren.size()); return std::make_shared( - type, std::move(compiledChildren[0]), trackCpuUsage); + type, std::move(compiledChildren[0]), trackCpuUsage, false); +} + +TypePtr TryCastCallToSpecialForm::resolveType( + const std::vector& /* argTypes */) { + VELOX_FAIL("TRY CAST expressions do not support type resolution."); +} + +ExprPtr TryCastCallToSpecialForm::constructSpecialForm( + const TypePtr& type, + std::vector&& compiledChildren, + bool trackCpuUsage) { + VELOX_CHECK_EQ( + compiledChildren.size(), + 1, + "TRY CAST statements expect exactly 1 argument, received {}", + compiledChildren.size()); + return std::make_shared( + type, std::move(compiledChildren[0]), trackCpuUsage, true); } } // namespace facebook::velox::exec diff --git a/velox/expression/CastExpr.h b/velox/expression/CastExpr.h index 7e701f9c19ec..3be65997169a 100644 --- a/velox/expression/CastExpr.h +++ b/velox/expression/CastExpr.h @@ -22,6 +22,7 @@ namespace facebook::velox::exec { constexpr folly::StringPiece kCast = "cast"; +constexpr folly::StringPiece kTryCast = "try_cast"; /// Custom operator for casts from and to custom types. class CastOperator { @@ -71,13 +72,14 @@ class CastExpr : public SpecialForm { /// @param type The target type of the cast expression /// @param expr The expression to cast /// @param trackCpuUsage Whether to track CPU usage - CastExpr(TypePtr type, ExprPtr&& expr, bool trackCpuUsage) + CastExpr(TypePtr type, ExprPtr&& expr, bool trackCpuUsage, bool nullOnFailure) : SpecialForm( type, std::vector({expr}), - kCast.data(), + nullOnFailure ? 
kTryCast.data() : kCast.data(), false /* supportsFlatNoNullsFastPath */, - trackCpuUsage) { + trackCpuUsage), + nullOnFailure_(nullOnFailure) { auto fromType = inputs_[0]->type(); castFromOperator_ = getCustomTypeCastOperator(fromType->toString()); if (castFromOperator_ && !castFromOperator_->isSupportedToType(type)) { @@ -160,6 +162,94 @@ class CastExpr : public SpecialForm { const TypePtr& toType, VectorPtr& result); + template + void applyToSelectedNoThrowLocal( + EvalCtx& context, + const SelectivityVector& rows, + VectorPtr& result, + Func&& func); + + /// The per-row level Kernel + /// @tparam ToKind The cast target type + /// @tparam FromKind The expression type + /// @param row The index of the current row + /// @param input The input vector (of type FromKind) + /// @param result The output vector (of type ToKind) + template + void applyCastKernel( + vector_size_t row, + EvalCtx& context, + const SimpleVector::NativeType>* input, + FlatVector::NativeType>* result); + + VectorPtr castFromDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& toType); + + VectorPtr castToDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType); + + template + void applyDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType, + const TypePtr& toType, + VectorPtr& castResult); + + template + void applyIntToDecimalCastKernel( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& toType, + VectorPtr& castResult); + + template + VectorPtr applyDecimalToDoubleCast( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + const TypePtr& fromType); + + template + void applyCastPrimitives( + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result); + + template + void applyCastPrimitivesDispatch( + const TypePtr& fromType, + const TypePtr& toType, + const SelectivityVector& rows, + exec::EvalCtx& context, + const BaseVector& input, + VectorPtr& result); + + template + void castTimestampToDate( + const SelectivityVector& rows, + const BaseVector& input, + exec::EvalCtx& context, + VectorPtr& result, + const date::time_zone* timeZone = nullptr); + + bool nullOnFailure() const { + return nullOnFailure_; + } + + bool setNullInResultAtError() const { + return nullOnFailure() && inTopLevel; + } + // Custom cast operator for the from-type. Nullptr if the type is native or // doesn't support cast-from. CastOperatorPtr castFromOperator_; @@ -167,6 +257,10 @@ class CastExpr : public SpecialForm { // Custom cast operator for the to-type. Nullptr if the type is native or // doesn't support cast-to. 
  CastOperatorPtr castToOperator_;
+
+  bool nullOnFailure_;
+
+  bool inTopLevel = false;
 };
 
 class CastCallToSpecialForm : public FunctionCallToSpecialForm {
@@ -179,4 +273,15 @@ class CastCallToSpecialForm : public FunctionCallToSpecialForm {
       bool trackCpuUsage) override;
 };
 
+class TryCastCallToSpecialForm : public FunctionCallToSpecialForm {
+ public:
+  TypePtr resolveType(const std::vector<TypePtr>& argTypes) override;
+
+  ExprPtr constructSpecialForm(
+      const TypePtr& type,
+      std::vector<ExprPtr>&& compiledChildren,
+      bool trackCpuUsage) override;
+};
 } // namespace facebook::velox::exec
+
+#include "velox/expression/CastExpr-inl.h"
diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp
index 12119256fa97..cefdf3f065e2 100644
--- a/velox/expression/EvalCtx.cpp
+++ b/velox/expression/EvalCtx.cpp
@@ -211,6 +211,23 @@ void EvalCtx::addElementErrorsToTopLevel(
   });
 }
 
+void EvalCtx::convertElementErrorsToTopLevelNulls(
+    const SelectivityVector& elementRows,
+    const BufferPtr& elementToTopLevelRows,
+    VectorPtr& result) {
+  if (!errors_) {
+    return;
+  }
+
+  const auto* rawElementToTopLevelRows =
+      elementToTopLevelRows->as<vector_size_t>();
+  elementRows.applyToSelected([&](auto row) {
+    if (errors_->isIndexInRange(row) && !errors_->isNullAt(row)) {
+      result->setNull(rawElementToTopLevelRows[row], true);
+    }
+  });
+}
+
 const VectorPtr& EvalCtx::getField(int32_t index) const {
   const VectorPtr* field;
   if (!peeledFields_.empty()) {
diff --git a/velox/expression/EvalCtx.h b/velox/expression/EvalCtx.h
index 3e82f4c97eaf..ce9506b1a6ca 100644
--- a/velox/expression/EvalCtx.h
+++ b/velox/expression/EvalCtx.h
@@ -134,6 +134,13 @@ class EvalCtx {
       const BufferPtr& elementToTopLevelRows,
       ErrorVectorPtr& topLevelErrors);
 
+  // Given a mapping from element rows to top-level rows, set errors in
+  // the elements as nulls in the top-level rows.
+ void convertElementErrorsToTopLevelNulls( + const SelectivityVector& elementRows, + const BufferPtr& elementToTopLevelRows, + VectorPtr& result); + void deselectErrors(SelectivityVector& rows) const { if (!errors_) { return; diff --git a/velox/expression/ExprCompiler.cpp b/velox/expression/ExprCompiler.cpp index 94ce09fe3f20..9a9cff4ae9fb 100644 --- a/velox/expression/ExprCompiler.cpp +++ b/velox/expression/ExprCompiler.cpp @@ -412,13 +412,11 @@ ExprPtr compileRewrittenExpression( } else if (auto cast = dynamic_cast(expr.get())) { VELOX_CHECK(!compiledInputs.empty()); auto castExpr = std::make_shared( - resultType, std::move(compiledInputs[0]), trackCpuUsage); - if (cast->nullOnFailure()) { - result = - getSpecialForm(config, "try", resultType, {castExpr}, trackCpuUsage); - } else { - result = castExpr; - } + resultType, + std::move(compiledInputs[0]), + trackCpuUsage, + cast->nullOnFailure()); + result = castExpr; } else if (auto call = dynamic_cast(expr.get())) { if (auto specialForm = getSpecialForm( config, diff --git a/velox/expression/FunctionCallToSpecialForm.cpp b/velox/expression/FunctionCallToSpecialForm.cpp index 46855c30d857..adf576a18cb5 100644 --- a/velox/expression/FunctionCallToSpecialForm.cpp +++ b/velox/expression/FunctionCallToSpecialForm.cpp @@ -32,6 +32,7 @@ RegistryType makeRegistry() { registry.emplace( "and", std::make_unique(true /* isAnd */)); registry.emplace("cast", std::make_unique()); + registry.emplace("try_cast", std::make_unique()); registry.emplace("coalesce", std::make_unique()); registry.emplace("if", std::make_unique()); registry.emplace( diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 6bd5bcd57d0f..259db9cfe44a 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -27,8 +27,8 @@ #include "velox/vector/TypeAliases.h" using namespace facebook::velox; -using namespace facebook::velox::test; - +namespace facebook::velox::test { +namespace { class CastExprTest : public functions::test::CastBaseTest { protected: CastExprTest() { @@ -120,7 +120,7 @@ class CastExprTest : public functions::test::CastBaseTest { exec::EvalCtx evalCtx(&execCtx_, &dictionaryExprSet, rowVector.get()); dictionaryExprSet.eval(rows, evalCtx, result); - auto indices = ::makeIndicesInReverse(size, pool()); + auto indices = functions::test::makeIndicesInReverse(size, pool()); assertEqualVectors(wrapInDictionary(indices, size, expected), result[0]); } } @@ -691,6 +691,28 @@ TEST_F(CastExprTest, mapCast) { VELOX_CHECK(start + size - 1 < valuesSize); } } + + // Error handling. + { + auto data = makeRowVector( + {makeMapVector({{{"1", "2"}}, {{"", "1"}}})}); + auto result1 = evaluate("try_cast(c0 as map(int, int))", data); + auto result2 = evaluate("try(cast(c0 as map(int, int)))", data); + ASSERT_FALSE(result1->isNullAt(0)); + ASSERT_TRUE(result1->isNullAt(1)); + + ASSERT_FALSE(result2->isNullAt(0)); + ASSERT_TRUE(result2->isNullAt(1)); + ASSERT_THROW(evaluate("cast(c0 as map(int, int)", data), VeloxException); + } + + { + auto result = evaluate( + "try_cast(map(array_constructor('1'), array_constructor('')) as map(int, int))", + makeRowVector({makeFlatVector({1, 2})})); + ASSERT_TRUE(result->isNullAt(0)); + ASSERT_TRUE(result->isNullAt(1)); + } } TEST_F(CastExprTest, arrayCast) { @@ -751,6 +773,34 @@ TEST_F(CastExprTest, arrayCast) { VELOX_CHECK(start + size - 1 < elementsSize); } } + + // Error handling. 
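+  // Both try_cast and try(cast(...)) are expected to null out only the rows
+  // whose elements fail to cast; an unprotected cast should still throw.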
+ { + auto data = + makeRowVector({makeArrayVector({{"1", "2"}, {"", "1"}})}); + auto result1 = evaluate("try_cast(c0 as bigint[])", data); + auto result2 = evaluate("try(cast(c0 as bigint[]))", data); + + auto expected = makeNullableArrayVector({{{1, 2}}, std::nullopt}); + + assertEqualVectors(result1, expected); + assertEqualVectors(result2, expected); + + ASSERT_THROW(evaluate("cast(c0 as bigint[])", data), VeloxException); + } + + { + auto data = makeNullableNestedArrayVector({ + {{{{"1"_sv, "2"_sv}}, {{""_sv}}}}, // row0 + {{{{std::nullopt, "4"_sv}}}}, // row1 + }); + auto expected = makeNullableNestedArrayVector({ + std::nullopt, // row0 + {{{{std::nullopt, 4}}}}, // row1 + + }); + testComplexCast("c0", data, expected, true); + } } TEST_F(CastExprTest, rowCast) { @@ -801,6 +851,58 @@ TEST_F(CastExprTest, rowCast) { {"c0", "b"}, {doubleVectorNullEvery11, intVectorNullAll}, nullEvery(5)); testComplexCast("c0", rowVector, expectedRowVector); } + + // Error handling. + { + auto data = makeRowVector( + {makeFlatVector({"1", ""}), + makeFlatVector({"2", "3"})}); + + auto expected = makeRowVector( + {makeFlatVector({1, 2}), makeFlatVector({2, 3})}); + expected->setNull(1, true); + + testComplexCast("c0", data, expected, true); + } + + { + auto data = makeRowVector( + {makeArrayVector({{"1", ""}, {"3", "4"}}), + makeFlatVector({"2", ""})}); + + // expected1 is [null, struct{[3,4], ""}] + auto expected1 = makeRowVector( + {makeArrayVector({{1 /*will be null*/}, {3, 4}}), + makeFlatVector({"2" /*will be null*/, ""})}); + expected1->setNull(0, true); + + // expected2 is [struct{["1",""], 2}, null] + auto expected2 = makeRowVector( + {makeArrayVector({{"1", ""}, {"3", "4"}}), + makeFlatVector({2, 0 /*null*/})}); + expected2->setNull(1, true); + + // expected3 is [null, null] + auto expected3 = makeRowVector( + {makeArrayVector({{1}}), makeFlatVector(1)}); + expected3->resize(2); + expected3->setNull(0, true); + expected3->setNull(1, true); + + testComplexCast("c0", data, expected1, true); + testComplexCast("c0", data, expected2, true); + testComplexCast("c0", data, expected3, true); + } + + // Null handling for nested structs. + { + auto data = + makeRowVector({makeRowVector({makeFlatVector({"1", ""})})}); + auto expected = + makeRowVector({makeRowVector({makeFlatVector({1, 0})})}); + expected->setNull(1, true); + testComplexCast("c0", data, expected, true); + } } TEST_F(CastExprTest, nulls) { @@ -1135,7 +1237,7 @@ class TestingDictionaryOverConstFunction : public exec::VectorFunction { const auto size = rows.size(); auto constant = BaseVector::wrapInConstant(size, 0, args[0]); - auto indices = makeIndicesInReverse(size, context.pool()); + auto indices = functions::test::makeIndicesInReverse(size, context.pool()); auto nulls = allocateNulls(size, context.pool()); result = BaseVector::wrapInDictionary(nulls, indices, size, std::move(constant)); @@ -1168,7 +1270,8 @@ TEST_F(CastExprTest, dictionaryOverConst) { } namespace { -// Wrap input in a dictionary that point to subset of rows of the inner vector. +// Wrap input in a dictionary that point to subset of rows of the inner +// vector. class TestingDictionaryToFewerRowsFunction : public exec::VectorFunction { public: TestingDictionaryToFewerRowsFunction() {} @@ -1206,16 +1309,16 @@ TEST_F(CastExprTest, dictionaryEncodedNestedInput) { // Cast ARRAY> to ARRAY> where the outermost ARRAY // layer and innermost BIGINT layer are dictionary-encoded. 
This test case
   // ensures that when casting the ROW vector, the result ROW vector
-  // would not be longer than the result VARCHAR vector. In the test below, the
-  // ARRAY vector has 2 rows, each containing 3 elements. The ARRAY vector is
-  // wrapped in a dictionary layer that only references its first row, hence
-  // only the first 3 out of 6 rows are evaluated for the ROW and BIGINT vector.
-  // The BIGINT vector is also dictionary-encoded, so CastExpr produces a result
-  // VARCHAR vector of length 3. If the casting of the ROW vector produces a
-  // result ROW vector of the length of all rows, i.e., 6, the
-  // subsequent call to Expr::addNull() would throw due to the attempt of
-  // accessing the element VARCHAR vector at indices corresonding to the
-  // non-existent ROW at indices 3--5.
+  // would not be longer than the result VARCHAR vector. In the test below,
+  // the ARRAY vector has 2 rows, each containing 3 elements. The ARRAY vector
+  // is wrapped in a dictionary layer that only references its first row,
+  // hence only the first 3 out of 6 rows are evaluated for the ROW and BIGINT
+  // vector. The BIGINT vector is also dictionary-encoded, so CastExpr
+  // produces a result VARCHAR vector of length 3. If the casting of the ROW
+  // vector produces a result ROW vector of the length of all rows,
+  // i.e., 6, the subsequent call to Expr::addNull() would throw due to the
+  // attempt of accessing the element VARCHAR vector at indices corresponding
+  // to the non-existent ROW at indices 3--5.
   exec::registerVectorFunction(
       "add_dict",
       TestingDictionaryToFewerRowsFunction::signatures(),
@@ -1256,3 +1359,64 @@ TEST_F(CastExprTest, smallerNonNullRowsSizeThanRows) {
   auto expected = makeNullableFlatVector({4, 6, 7, std::nullopt});
   assertEqualVectors(expected, result);
 }
+
+TEST_F(CastExprTest, tryCastDoesNotHideInputsAndExistingErrors) {
+  auto test = [&](const std::string& castExprThatThrow,
+                  const std::string& type,
+                  const auto& data) {
+    ASSERT_THROW(
+        auto result = evaluate(
+            fmt::format("try_cast({} as {})", castExprThatThrow, type), data),
+        VeloxException);
+
+    ASSERT_NO_THROW(evaluate(
+        fmt::format("try (cast ({} as {}))", castExprThatThrow, type), data));
+    ASSERT_NO_THROW(evaluate(fmt::format("try_{}", castExprThatThrow), data));
+    ASSERT_NO_THROW(evaluate(fmt::format("try ({})", castExprThatThrow), data));
+  };
+
+  {
+    auto data = makeRowVector({makeFlatVector({1, 2, 3, 4})});
+    test("cast('' as int)", "int", data);
+  }
+
+  {
+    auto data =
+        makeRowVector({makeArrayVector({{"1", "", "3", "4"}})});
+    test("cast(c0 as integer[])", "integer[]", data);
+    test("cast(map(c0, c0) as map(int, int))", "map(int, int)", data);
+    test(
+        "cast(row_constructor(c0, c0, c0) as struct(a int[], b bigint[], c float[]))",
+        "struct(a int[], b bigint[], c float[])",
+        data);
+  }
+
+  {
+    auto data = makeRowVector(
+        {makeFlatVector({true, false, true, false}),
+         makeFlatVector({{"1", "2", "3", "4"}})});
+
+    ASSERT_THROW(
+        evaluate("switch(c0, cast('' as int), cast(c1 as integer))", data),
+        VeloxException);
+
+    ASSERT_THROW(
+        evaluate("switch(c0, cast('' as int), try_cast(c1 as integer))", data),
+        VeloxException);
+
+    {
+      auto result = evaluate(
+          "try(switch(c0, cast('' as int), cast(c1 as integer)))", data);
+      ASSERT_TRUE(result->isNullAt(0));
+      ASSERT_TRUE(result->isNullAt(2));
+    }
+
+    {
+      auto result = evaluate(
+          "try(switch(c0, try_cast('' as int), cast(c1 as integer)))", data);
+      ASSERT_TRUE(result->isNullAt(0));
+      ASSERT_TRUE(result->isNullAt(2));
+    }
+  }
+}
+} // namespace
+} // namespace facebook::velox::test
diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp
index 7adf29b8d5c3..ab588381ae7d 100644
--- a/velox/expression/tests/ExpressionVerifier.cpp
+++ b/velox/expression/tests/ExpressionVerifier.cpp
@@ -19,6 +19,7 @@
 #include "velox/expression/Expr.h"
 #include "velox/vector/VectorSaver.h"
 #include "velox/vector/tests/utils/VectorMaker.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
 
 namespace facebook::velox::test {
 
@@ -41,6 +42,15 @@ void logRowVector(const RowVectorPtr& rowVector) {
     }
   }
 }
+
+namespace {
+auto createCopy(const VectorPtr& input) {
+  VectorPtr result;
+  SelectivityVector rows(input->size());
+  BaseVector::ensureWritable(rows, input->type(), input->pool(), result);
+  result->copy(input.get(), rows, nullptr);
+  return result;
+}
+} // namespace
 
 void compareVectors(
     const VectorPtr& left,
@@ -147,9 +157,13 @@ ResultOrError ExpressionVerifier::verify(
       LOG(INFO) << "Modified inputs for common eval path: ";
       logRowVector(inputRowVector);
     }
-    exec::EvalCtx evalCtxCommon(execCtx_, &exprSetCommon, inputRowVector.get());
+    auto copy = createCopy(inputRowVector);
+
+    exec::EvalCtx evalCtxCommon(execCtx_, &exprSetCommon, inputRowVector.get());
     exprSetCommon.eval(rows, evalCtxCommon, commonEvalResult);
+    assertEqualVectors(copy, inputRowVector);
+
   } catch (const VeloxUserError&) {
     if (!canThrow) {
       LOG(ERROR)
@@ -175,7 +189,10 @@
     exec::EvalCtx evalCtxSimplified(
         execCtx_, &exprSetSimplified, rowVector.get());
 
+    auto copy = createCopy(rowVector);
     exprSetSimplified.eval(rows, evalCtxSimplified, simplifiedEvalResult);
+    assertEqualVectors(copy, rowVector);
+
   } catch (const VeloxUserError&) {
     exceptionSimplifiedPtr = std::current_exception();
   } catch (...) {
diff --git a/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp b/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp
index 9c799aaca05f..fe01163a8597 100644
--- a/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp
+++ b/velox/expression/tests/FacebookPrestoExpressionFuzzerTest.cpp
@@ -25,9 +25,14 @@
 #include "velox/functions/facebook/prestosql/Register.h"
 #include "velox/functions/prestosql/registration/RegistrationFunctions.h"
 
+DEFINE_bool(
+    include_fb_only,
+    true,
+    "If true, fb-only functions are included in the test.");
+
 DEFINE_int64(
     seed,
-    123456,
+    0,
     "Initial seed for random number generator "
     "(use it to reproduce previous results).");
 
@@ -44,22 +49,14 @@ DEFINE_string(
     "Comma-separated list of special forms to use in generated expression. "
     "Supported special forms: and, or, coalesce, if, switch, cast.");
 
-int main(int argc, char** argv) {
-  facebook::velox::functions::prestosql::registerAllScalarFacebookOnlyFunctions(
-      "");
-  facebook::velox::functions::prestosql::registerAllScalarFunctions();
-
-  ::testing::InitGoogleTest(&argc, argv);
-
-  // Calls common init functions in the necessary order, initializing
-  // singletons, installing proper signal handlers for better debugging
-  // experience, and initialize glog and gflags.
- folly::init(&argc, &argv); - - // The following list are the Spark UDFs that hit issues - // For rlike you need the following combo in the only list: - // rlike, md5 and upper +class FacebookPrestoExpressionFuzzerTest : public testing::Test {}; +TEST_F(FacebookPrestoExpressionFuzzerTest, test) { + if (FLAGS_include_fb_only) { + facebook::velox::functions::prestosql:: + registerAllScalarFacebookOnlyFunctions(""); + } + facebook::velox::functions::prestosql::registerAllScalarFunctions(); // TODO: List of the functions that at some point crash or fail and need to // be fixed before we can enable. std::unordered_set skipFunctions = { @@ -76,7 +73,7 @@ int main(int argc, char** argv) { // https://github.com/facebookincubator/velox/issues/5398 "concat", }; - - return FuzzerRunner::run( - FLAGS_only, FLAGS_seed, skipFunctions, FLAGS_special_forms); + size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; + return FuzzerRunner::runFromGtest( + FLAGS_only, initialSeed, skipFunctions, FLAGS_special_forms); } diff --git a/velox/expression/tests/FuzzerRunner.h b/velox/expression/tests/FuzzerRunner.h index 3c4c0ecd7c39..5d5e5bf65daa 100644 --- a/velox/expression/tests/FuzzerRunner.h +++ b/velox/expression/tests/FuzzerRunner.h @@ -135,11 +135,18 @@ class FuzzerRunner { size_t seed, const std::unordered_set& skipFunctions, const std::string& specialForms) { + runFromGtest(onlyFunctions, seed, skipFunctions, specialForms); + return RUN_ALL_TESTS(); + } + + static void runFromGtest( + const std::string& onlyFunctions, + size_t seed, + const std::unordered_set& skipFunctions, + const std::string& specialForms) { auto signatures = facebook::velox::getFunctionSignatures(); appendSpecialForms(specialForms, signatures); facebook::velox::test::expressionFuzzer( filterSignatures(signatures, onlyFunctions, skipFunctions), seed); - // Calling gtest here so that it can be recognized as tests in CI systems. - return RUN_ALL_TESTS(); } }; diff --git a/velox/functions/lib/RowsTranslationUtil.h b/velox/functions/lib/RowsTranslationUtil.h index 93f9b56595fc..bd374bb9529b 100644 --- a/velox/functions/lib/RowsTranslationUtil.h +++ b/velox/functions/lib/RowsTranslationUtil.h @@ -13,6 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#pragma once + #include "velox/common/base/Nulls.h" #include "velox/vector/BaseVector.h" #include "velox/vector/SelectivityVector.h" diff --git a/velox/functions/prestosql/ArrayFunctions.h b/velox/functions/prestosql/ArrayFunctions.h index 6d00dcb57315..1b8030fb2fc9 100644 --- a/velox/functions/prestosql/ArrayFunctions.h +++ b/velox/functions/prestosql/ArrayFunctions.h @@ -697,35 +697,4 @@ struct ArrayUnionFunction { } }; -template -struct ArrayUnionFunctionString { - VELOX_DEFINE_FUNCTION_TYPES(T); - - static constexpr int32_t reuse_strings_from_arg = 0; - - // String version that avoids copy of strings. 
- FOLLY_ALWAYS_INLINE void call( - out_type>& out, - const arg_type>& inputArray1, - const arg_type>& inputArray2) { - folly::F14FastSet elementSet; - bool nullAdded = false; - auto addItems = [&](auto& inputArray) { - for (const auto& item : inputArray) { - if (item.has_value()) { - if (elementSet.insert(item.value()).second) { - auto& newItem = out.add_item(); - newItem.setNoCopy(item.value()); - } - } else if (!nullAdded) { - nullAdded = true; - out.add_null(); - } - } - }; - addItems(inputArray1); - addItems(inputArray2); - } -}; - } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/InPredicate.cpp b/velox/functions/prestosql/InPredicate.cpp index 2383cb8cd8d2..d979fbd49e9d 100644 --- a/velox/functions/prestosql/InPredicate.cpp +++ b/velox/functions/prestosql/InPredicate.cpp @@ -86,6 +86,43 @@ std::pair, bool> createBigintValuesFilter( return {common::createBigintValues(values, nullAllowed), false}; } +// Cast double to Int64 and reuse Int64 filters +template +std::pair, bool> +createFloatingPointValuesFilter( + const std::vector& inputArgs) { + auto valuesPair = toValues(inputArgs); + if (!valuesPair.has_value()) { + return {nullptr, false}; + } + + auto& values = valuesPair.value().first; + bool nullAllowed = valuesPair.value().second; + + if (values.empty() && nullAllowed) { + return {nullptr, true}; + } + VELOX_USER_CHECK( + !values.empty(), + "IN predicate expects at least one non-null value in the in-list"); + + if (values.size() == 1) { + return { + std::make_unique>( + values[0], false, false, values[0], false, false, nullAllowed), + false}; + } + + std::vector intValues(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + if (values[i] == double{}) { + values[i] = 0; + } + intValues[i] = reinterpret_cast(values[i]); + } + return {common::createBigintValues(intValues, nullAllowed), false}; +} + // See createBigintValuesFilter. std::pair, bool> createBytesValuesFilter( const std::vector& inputArgs) { @@ -140,6 +177,9 @@ class InPredicate : public exec::VectorFunction { case TypeKind::TINYINT: filter = createBigintValuesFilter(inputArgs); break; + case TypeKind::DOUBLE: + filter = createFloatingPointValuesFilter(inputArgs); + break; case TypeKind::BOOLEAN: // Hack: using BIGINT filter for bool, which is essentially "int1_t". 
filter = createBigintValuesFilter(inputArgs); @@ -194,6 +234,19 @@ class InPredicate : public exec::VectorFunction { return filter_->testInt64(value); }); break; + case TypeKind::DOUBLE: + applyTyped(rows, input, context, result, [&](double value) { + auto* derived = + dynamic_cast*>(filter_.get()); + if (derived) { + return filter_->testDouble(value); + } + if (value == double{}) { + value = 0; + } + return filter_->testInt64(reinterpret_cast(value)); + }); + break; case TypeKind::BOOLEAN: applyTyped(rows, input, context, result, [&](bool value) { return filter_->testInt64(value); @@ -224,6 +277,7 @@ class InPredicate : public exec::VectorFunction { "bigint", "varchar", "varbinary", + "double", "date"}) { signatures.emplace_back(exec::FunctionSignatureBuilder() .returnType("boolean") diff --git a/velox/functions/prestosql/Probability.h b/velox/functions/prestosql/Probability.h index 3d58a7683900..b500b8cb904d 100644 --- a/velox/functions/prestosql/Probability.h +++ b/velox/functions/prestosql/Probability.h @@ -18,6 +18,8 @@ #include "boost/math/distributions/beta.hpp" #include "boost/math/distributions/binomial.hpp" #include "boost/math/distributions/cauchy.hpp" +#include "boost/math/distributions/chi_squared.hpp" +#include "boost/math/distributions/fisher_f.hpp" #include "velox/common/base/Exceptions.h" #include "velox/functions/Macros.h" @@ -136,5 +138,33 @@ struct InverseBetaCDFFunction { } }; +template +struct ChiSquaredCDFFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call(double& result, double df, double value) { + VELOX_USER_CHECK_GT(df, 0, "df must be greater than 0"); + VELOX_USER_CHECK_GE(value, 0, "value must non-negative"); + + boost::math::chi_squared_distribution<> dist(df); + result = boost::math::cdf(dist, value); + } +}; + +template +struct FCDFFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void + call(double& result, double df1, double df2, double value) { + VELOX_USER_CHECK_GE(value, 0, "value must non-negative"); + VELOX_USER_CHECK_GT(df1, 0, "numerator df must be greater than 0"); + VELOX_USER_CHECK_GT(df2, 0, "denominator df must be greater than 0"); + + boost::math::fisher_f_distribution<> dist(df1, df2); + result = boost::math::cdf(dist, value); + } +}; + } // namespace } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/SIMDJsonFunctions.h b/velox/functions/prestosql/SIMDJsonFunctions.h index e39d6ed5c13c..f6e727e4e147 100644 --- a/velox/functions/prestosql/SIMDJsonFunctions.h +++ b/velox/functions/prestosql/SIMDJsonFunctions.h @@ -205,4 +205,119 @@ struct SIMDJsonExtractScalarFunction { } }; +template +struct SIMDJsonExtractFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + bool call( + out_type& result, + const arg_type& json, + const arg_type& jsonPath) { + static constexpr std::string_view kNullString{"null"}; + std::string results; + size_t resultSize = 0; + auto consumer = [&results, &resultSize](auto& v) { + // Add the separator for the JSON array. + if (resultSize++ > 0) { + results += ","; + } + // We could just convert v to a string using to_json_string directly, but + // in that case the JSON wouldn't be parsed (it would just return the + // contents directly) and we might miss invalid JSON. 
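+      // For example, with path $.key[*].k1 over {"key":[{"k1":1},{"k1":2}]}
+      // the consumer runs once per match, emitting 1 and 2, which call()
+      // later joins into the array "[1,2]".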
+ switch (v.type()) { + case simdjson::ondemand::json_type::object: + results += simdjson::to_json_string(v.get_object()).value(); + break; + case simdjson::ondemand::json_type::array: + results += simdjson::to_json_string(v.get_array()).value(); + break; + case simdjson::ondemand::json_type::string: + case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::boolean: + results += simdjson::to_json_string(v).value(); + break; + case simdjson::ondemand::json_type::null: + results += kNullString; + break; + } + }; + + if (!simdJsonExtract(json, jsonPath, consumer)) { + // If there's an error parsing the JSON, return null. + return false; + } + + if (resultSize == 0) { + // If the path didn't map to anything in the JSON object, return null. + return false; + } + + if (resultSize == 1) { + if (results == kNullString) { + // If there was only one value mapped to by the path and it was null, + // return null directly. + return false; + } + + // If there was only one value mapped to by the path, don't wrap it in an + // array. + result.copy_from(results); + } else { + // Add the square brackets to make it a valid JSON array. + result.copy_from("[" + results + "]"); + } + return true; + } +}; + +template +struct SIMDJsonSizeFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE bool call( + int64_t& result, + const arg_type& json, + const arg_type& jsonPath) { + size_t resultCount = 0; + size_t singleResultSize = 0; + auto consumer = [&resultCount, &singleResultSize](auto& v) { + resultCount++; + + if (resultCount == 1) { + // We only need the size of the actual object if there's only one + // returned, if multiple are returned we use the number of objects + // returned instead. + switch (v.type()) { + case simdjson::ondemand::json_type::object: + singleResultSize = v.count_fields().value(); + break; + case simdjson::ondemand::json_type::array: + singleResultSize = v.count_elements().value(); + break; + case simdjson::ondemand::json_type::string: + case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::boolean: + case simdjson::ondemand::json_type::null: + singleResultSize = 0; + break; + } + } + }; + + if (!simdJsonExtract(json, jsonPath, consumer)) { + // If there's an error parsing the JSON, return null. + return false; + } + + if (resultCount == 0) { + // If the path didn't map to anything in the JSON object, return null. + return false; + } + + result = resultCount == 1 ? singleResultSize : resultCount; + + return true; + } +}; + } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp b/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp index b9ee996e22be..da44590fda6c 100644 --- a/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp +++ b/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp @@ -180,6 +180,31 @@ class MinAggregate : public MinMaxAggregate { } } + bool supportsToIntermediate() const override { + return true; + } + + void toIntermediate( + const SelectivityVector& rows, + std::vector& args, + VectorPtr& result) const override { + const auto& input = args[0]; + if (rows.isAllSelected()) { + result = input; + return; + } + + auto* pool = BaseAggregate::allocator_->pool(); + + result = BaseVector::create(input->type(), rows.size(), pool); + result->copy(input.get(), 0, 0, rows.size()); + + // Set result to NULL for rows that are masked out. 
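+    // allocateNulls(..., bits::kNull) starts with every row marked null;
+    // rows.clearNulls() then clears the null bit for the selected rows, so
+    // only the masked-out rows stay null.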
+    BufferPtr nulls = allocateNulls(rows.size(), pool, bits::kNull);
+    rows.clearNulls(nulls);
+    result->setNulls(nulls);
+  }
+
   void addRawInput(
       char** groups,
       const SelectivityVector& rows,
@@ -258,6 +283,33 @@
     }
   }
 
+  bool supportsToIntermediate() const override {
+    return true;
+  }
+
+  void toIntermediate(
+      const SelectivityVector& rows,
+      std::vector<VectorPtr>& args,
+      VectorPtr& result) const override {
+    const auto& input = args[0];
+    if (rows.isAllSelected()) {
+      result = input;
+      return;
+    }
+
+    auto* pool = allocator_->pool();
+
+    // Set result to NULL for rows that are masked out.
+    BufferPtr nulls = allocateNulls(rows.size(), pool, bits::kNull);
+    rows.clearNulls(nulls);
+
+    BufferPtr indices = allocateIndices(rows.size(), pool);
+    auto* rawIndices = indices->asMutable<vector_size_t>();
+    std::iota(rawIndices, rawIndices + rows.size(), 0);
+
+    result = BaseVector::wrapInDictionary(nulls, indices, rows.size(), input);
+  }
+
   void extractValues(char** groups, int32_t numGroups, VectorPtr* result)
       override {
     VELOX_CHECK(result);
diff --git a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp
index 232e60d1f64d..769bfd224414 100644
--- a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp
+++ b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp
@@ -60,6 +60,15 @@ TEST_F(ArrayAggTest, groupBy) {
       {"c0"},
       {"array_agg(a)"},
       "SELECT c0, array_agg(a) FROM tmp GROUP BY c0");
+
+  // With one function that supports toIntermediate and one that does not,
+  // make sure the row container is recreated with only the function without
+  // toIntermediate support.
+  testAggregations(
+      batches,
+      {"c0"},
+      {"array_agg(a)", "max(c0)"},
+      "SELECT c0, array_agg(a), max(c0) FROM tmp GROUP BY c0");
 }
 
 TEST_F(ArrayAggTest, sortedGroupBy) {
diff --git a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
index fb0952e5992f..b798c2640f9e 100644
--- a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
+++ b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
@@ -51,6 +51,14 @@ class JsonBenchmark : public velox::functions::test::FunctionBenchmarkBase {
         {"folly_json_extract_scalar"});
     registerFunction(
         {"simd_json_extract_scalar"});
+    registerFunction(
+        {"folly_json_extract"});
+    registerFunction(
+        {"simd_json_extract"});
+    registerFunction(
+        {"folly_json_size"});
+    registerFunction(
+        {"simd_json_size"});
   }
 
   std::string prepareData(int jsonSize) {
@@ -203,6 +211,42 @@ void SIMDJsonExtractScalar(int iter, int vectorSize, int jsonSize) {
       iter, vectorSize, "simd_json_extract_scalar", json, "$.key[7].k1");
 }
 
+void FollyJsonExtract(int iter, int vectorSize, int jsonSize) {
+  folly::BenchmarkSuspender suspender;
+  JsonBenchmark benchmark;
+  auto json = benchmark.prepareData(jsonSize);
+  suspender.dismiss();
+  benchmark.runWithJsonExtract(
+      iter, vectorSize, "folly_json_extract", json, "$.key[*].k1");
+}
+
+void SIMDJsonExtract(int iter, int vectorSize, int jsonSize) {
+  folly::BenchmarkSuspender suspender;
+  JsonBenchmark benchmark;
+  auto json = benchmark.prepareData(jsonSize);
+  suspender.dismiss();
+  benchmark.runWithJsonExtract(
+      iter, vectorSize, "simd_json_extract", json, "$.key[*].k1");
+}
+
+void FollyJsonSize(int iter, int vectorSize, int jsonSize) {
+  folly::BenchmarkSuspender suspender;
+  JsonBenchmark benchmark;
+  auto json = benchmark.prepareData(jsonSize);
+  suspender.dismiss();
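+  // folly_json_size is the baseline; the SIMDJsonSize run below is reported
+  // relative to it via BENCHMARK_RELATIVE_NAMED_PARAM.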
benchmark.runWithJsonExtract( + iter, vectorSize, "folly_json_size", json, "$.key"); +} + +void SIMDJsonSize(int iter, int vectorSize, int jsonSize) { + folly::BenchmarkSuspender suspender; + JsonBenchmark benchmark; + auto json = benchmark.prepareData(jsonSize); + suspender.dismiss(); + benchmark.runWithJsonExtract( + iter, vectorSize, "simd_json_size", json, "$.key"); +} + BENCHMARK_DRAW_LINE(); BENCHMARK_NAMED_PARAM(FollyIsJsonScalar, 100_iters_10bytes_size, 100, 10); @@ -368,6 +412,64 @@ BENCHMARK_RELATIVE_NAMED_PARAM( 10000); BENCHMARK_DRAW_LINE(); +BENCHMARK_DRAW_LINE(); +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_10bytes_size, 100, 10); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_10bytes_size, + 100, + 10); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_100bytes_size, 100, 100); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_100bytes_size, + 100, + 100); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_1000bytes_size, 100, 1000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_1000bytes_size, + 100, + 1000); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonExtract, 100_iters_10000bytes_size, 100, 10000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonExtract, + 100_iters_10000bytes_size, + 100, + 10000); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_DRAW_LINE(); +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_10bytes_size, 100, 10); +BENCHMARK_RELATIVE_NAMED_PARAM(SIMDJsonSize, 100_iters_10bytes_size, 100, 10); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_100bytes_size, 100, 100); +BENCHMARK_RELATIVE_NAMED_PARAM(SIMDJsonSize, 100_iters_100bytes_size, 100, 100); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_1000bytes_size, 100, 1000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonSize, + 100_iters_1000bytes_size, + 100, + 1000); +BENCHMARK_DRAW_LINE(); + +BENCHMARK_NAMED_PARAM(FollyJsonSize, 100_iters_10000bytes_size, 100, 10000); +BENCHMARK_RELATIVE_NAMED_PARAM( + SIMDJsonSize, + 100_iters_10000bytes_size, + 100, + 10000); +BENCHMARK_DRAW_LINE(); + } // namespace } // namespace facebook::velox::functions::prestosql diff --git a/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp b/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp index 64d2a4e53b23..5898a2fd77fa 100644 --- a/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/ArithmeticFunctionsRegistration.cpp @@ -109,8 +109,12 @@ void registerSimpleFunctions(const std::string& prefix) { {prefix + "binomial_cdf"}); registerFunction( {prefix + "cauchy_cdf"}); + registerFunction( + {prefix + "chi_squared_cdf"}); registerFunction( {prefix + "inverse_beta_cdf"}); + registerFunction( + {prefix + "f_cdf"}); } } // namespace diff --git a/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp b/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp index 0e4604b200e3..b1ffd10271ee 100644 --- a/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp @@ -194,11 +194,6 @@ void registerArrayFunctions(const std::string& prefix) { Array>, int64_t>({prefix + "trim_array"}); - registerFunction< - ArrayUnionFunctionString, - Array, - Array, - Array>({prefix + "array_union"}); registerArrayUnionFunctions(prefix); registerArrayUnionFunctions(prefix); 
registerArrayUnionFunctions(prefix); diff --git a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp index bffda97f486a..763588418833 100644 --- a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp @@ -32,9 +32,9 @@ void registerJsonFunctions(const std::string& prefix) { registerFunction( {prefix + "json_extract_scalar"}); - registerFunction( + registerFunction( {prefix + "json_extract"}); - registerFunction( + registerFunction( {prefix + "json_extract"}); registerFunction( @@ -59,9 +59,9 @@ void registerJsonFunctions(const std::string& prefix) { registerFunction( {prefix + "json_array_contains"}); - registerFunction( + registerFunction( {prefix + "json_size"}); - registerFunction( + registerFunction( {prefix + "json_size"}); VELOX_REGISTER_VECTOR_FUNCTION(udf_json_format, prefix + "json_format"); diff --git a/velox/functions/prestosql/tests/InPredicateTest.cpp b/velox/functions/prestosql/tests/InPredicateTest.cpp index 3f54db94962f..026ca2cb8263 100644 --- a/velox/functions/prestosql/tests/InPredicateTest.cpp +++ b/velox/functions/prestosql/tests/InPredicateTest.cpp @@ -406,3 +406,157 @@ TEST_F(InPredicateTest, reusableResult) { auto expected = makeFlatVector({false, true, true, false}); assertEqualVectors(expected, actual); } + +TEST_F(InPredicateTest, doubleWithZero) { + // zero and negative zero, FloatingPointRange + auto input = makeRowVector({ + makeNullableFlatVector({0.0, -0.0}, DOUBLE()), + }); + auto predicate = "c0 IN ( 0.0 )"; + auto result = evaluate>(predicate, input); + auto expected = makeNullableFlatVector({true, true}); + assertEqualVectors(expected, result); + + // zero and negative zero, BigintValuesUsingHashTable, 0 in valuesList + input = makeRowVector({ + makeNullableFlatVector({0.0, -0.0}, DOUBLE()), + }); + predicate = "c0 IN ( 0.0, 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, true}); + assertEqualVectors(expected, result); + + // zero and negative zero, BigintValuesUsingHashTable, -0 in valuesList + input = makeRowVector({ + makeNullableFlatVector({0.0, -0.0}, DOUBLE()), + }); + predicate = "c0 IN ( -0.0, 1.2, 2.3, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, true}); + assertEqualVectors(expected, result); + + // TODO : zero and negative zero, BigintValuesUsingBitmask, depending on + // another fix +} + +TEST_F(InPredicateTest, double) { + // No Null + auto input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, 3.4}, DOUBLE()), + }); + std::string predicate = "c0 IN ( 1.2, 2.3, 3.4 )"; + auto expected = makeConstant(true, input->size()); + auto result = evaluate>(predicate, input); + assertEqualVectors(expected, result); + + // InList has Null + // Since there is only one non-null float, it will use FloatingPointRange + input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, 3.4}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, std::nullopt, std::nullopt}); + assertEqualVectors(expected, result); + + // InList has Null + // Multiple non-null, using BigintValuesUsingHashTable + predicate = "c0 IN ( 1.2, 2.3, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, true, std::nullopt}); + assertEqualVectors(expected, result); + + // Value(input) has NULL + 
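+  // A null input value should yield a null result rather than an error,
+  // matching SQL IN semantics.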
input = makeRowVector({ + makeNullableFlatVector({1.2, 1.3, std::nullopt}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, false, std::nullopt}); + assertEqualVectors(expected, result); + + // NaN + input = makeRowVector({ + makeNullableFlatVector({std::nan("")}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); + + predicate = "c0 IN ( 1.2, null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({std::nullopt}); + assertEqualVectors(expected, result); + + // Infinity + input = makeRowVector({ + makeNullableFlatVector( + {std::numeric_limits::infinity()}, DOUBLE()), + }); + predicate = "c0 IN ( 1.2, 2.3 )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); +} + +TEST_F(InPredicateTest, float) { + // No Null + auto input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, 3.4}, REAL()), + }); + std::string predicate = + "c0 IN ( CAST(1.2 AS REAL), CAST(2.3 AS REAL), CAST(3.4 AS REAL) )"; + auto expected = makeConstant(true, input->size()); + auto result = evaluate>(predicate, input); + assertEqualVectors(expected, result); + + /// InList has Null + // Since there is only one non-null float, it will use FloatingPointRange + predicate = "c0 IN ( CAST(1.2 AS REAL), null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, std::nullopt, std::nullopt}); + assertEqualVectors(expected, result); + + // InList has Null + // Multiple non-null, using BigintValuesUsingHashTable + // TODO: CAST(1.2 AS REAL), CAST(1.2 AS REAL) captured a bug in + // BigintValuesUsingBitmask, it will be fixed in separate diff + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL), null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, std::nullopt, std::nullopt}); + assertEqualVectors(expected, result); + + // Value(input) has NULL + input = makeRowVector({ + makeNullableFlatVector({1.2, 2.3, std::nullopt}, REAL()), + }); + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL) )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({true, false, std::nullopt}); + assertEqualVectors(expected, result); + + // NaN + input = makeRowVector({ + makeNullableFlatVector({std::nan("")}, REAL()), + }); + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL) )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); + + predicate = "c0 IN ( CAST(1.2 AS REAL), null )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({std::nullopt}); + assertEqualVectors(expected, result); + + // Infinity + input = makeRowVector({ + makeNullableFlatVector( + {std::numeric_limits::infinity()}, REAL()), + }); + predicate = "c0 IN ( CAST(1.2 AS REAL), CAST(1.3 AS REAL) )"; + result = evaluate>(predicate, input); + expected = makeNullableFlatVector({false}); + assertEqualVectors(expected, result); +} diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index ef59bb7b1638..68eb37d39cd3 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -538,10 +538,10 @@ TEST_F(JsonFunctionsTest, 
jsonExtract) { }; EXPECT_EQ( - "{\"x\":{\"a\":1,\"b\":2}}", + "{\"x\": {\"a\" : 1, \"b\" : 2} }", jsonExtract("{\"x\": {\"a\" : 1, \"b\" : 2} }", "$")); EXPECT_EQ( - "{\"a\":1,\"b\":2}", + "{\"a\" : 1, \"b\" : 2}", jsonExtract("{\"x\": {\"a\" : 1, \"b\" : 2} }", "$.x")); EXPECT_EQ("1", jsonExtract("{\"x\": {\"a\" : 1, \"b\" : 2} }", "$.x.a")); EXPECT_EQ( diff --git a/velox/functions/prestosql/tests/ProbabilityTest.cpp b/velox/functions/prestosql/tests/ProbabilityTest.cpp index 5b31d20f6947..2906992cf5cb 100644 --- a/velox/functions/prestosql/tests/ProbabilityTest.cpp +++ b/velox/functions/prestosql/tests/ProbabilityTest.cpp @@ -221,5 +221,64 @@ TEST_F(ProbabilityTest, invBetaCDF) { VELOX_ASSERT_THROW(invBetaCDF(3, 5, 1.1), "p must be in the interval [0, 1]"); } +TEST_F(ProbabilityTest, chiSquaredCDF) { + const auto chiSquaredCDF = [&](std::optional df, + std::optional value) { + return evaluateOnce("chi_squared_cdf(c0, c1)", df, value); + }; + + EXPECT_EQ(chiSquaredCDF(3, 0.0), 0.0); + EXPECT_EQ(chiSquaredCDF(3, 1.0), 0.1987480430987992); + EXPECT_EQ(chiSquaredCDF(3, 2.5), 0.52470891665697938); + EXPECT_EQ(chiSquaredCDF(3, 4), 0.73853587005088939); + // Invalid inputs + VELOX_ASSERT_THROW(chiSquaredCDF(-3, 0.3), "df must be greater than 0"); + VELOX_ASSERT_THROW(chiSquaredCDF(3, -10), "value must non-negative"); +} + +TEST_F(ProbabilityTest, fCDF) { + const auto fCDF = [&](std::optional df1, + std::optional df2, + std::optional value) { + return evaluateOnce("f_cdf(c0, c1, c2)", df1, df2, value); + }; + + EXPECT_EQ(fCDF(2.0, 5.0, 0.0), 0.0); + EXPECT_EQ(fCDF(2.0, 5.0, 0.7988), 0.50001145221750731); + EXPECT_EQ(fCDF(2.0, 5.0, 3.7797), 0.89999935988961155); + + EXPECT_EQ(fCDF(kDoubleMax, 5.0, 3.7797), 1); + EXPECT_EQ(fCDF(1, kDoubleMax, 97.1), 1); + EXPECT_EQ(fCDF(82.6, 901.10, kDoubleMax), 1); + EXPECT_EQ(fCDF(12.12, 4.2015, kDoubleMin), 0); + EXPECT_EQ(fCDF(0.4422, kDoubleMin, 0.697), 7.9148959162596482e-306); + EXPECT_EQ(fCDF(kDoubleMin, 50.620, 4), 1); + EXPECT_EQ(fCDF(kBigIntMax, 5.0, 3.7797), 0.93256230095450132); + EXPECT_EQ(fCDF(76.901, kBigIntMax, 77.97), 1); + EXPECT_EQ(fCDF(2.0, 5.0, kBigIntMax), 1); + + EXPECT_EQ(fCDF(2.0, 5.0, std::nullopt), std::nullopt); + EXPECT_EQ(fCDF(2.0, std::nullopt, 3.7797), std::nullopt); + EXPECT_EQ(fCDF(std::nullopt, 5.0, 3.7797), std::nullopt); + + // Test invalid inputs for df1. + VELOX_ASSERT_THROW(fCDF(0, 3, 0.5), "numerator df must be greater than 0"); + VELOX_ASSERT_THROW( + fCDF(kBigIntMin, 5.0, 3.7797), "numerator df must be greater than 0"); + + // Test invalid inputs for df2. + VELOX_ASSERT_THROW(fCDF(3, 0, 0.5), "denominator df must be greater than 0"); + VELOX_ASSERT_THROW( + fCDF(2.0, kBigIntMin, 3.7797), "denominator df must be greater than 0"); + + // Test invalid inputs for value. + VELOX_ASSERT_THROW(fCDF(3, 5, -0.1), "value must non-negative"); + VELOX_ASSERT_THROW(fCDF(2.0, 5.0, kBigIntMin), "value must non-negative"); + + // Test a combination of invalid inputs. + VELOX_ASSERT_THROW(fCDF(-1.2, 0, -0.1), "value must non-negative"); + VELOX_ASSERT_THROW(fCDF(1, -kInf, -0.1), "value must non-negative"); +} + } // namespace } // namespace facebook::velox diff --git a/velox/functions/sparksql/String.h b/velox/functions/sparksql/String.h index 6e935d2af79d..c3e4b67386fd 100644 --- a/velox/functions/sparksql/String.h +++ b/velox/functions/sparksql/String.h @@ -81,18 +81,29 @@ struct AsciiFunction { } }; +/// chr function +/// chr(n) -> string +/// Returns the Unicode code point ``n`` as a single character string. 
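+/// The result is UTF-8 encoded, so code points in [0x80, 0xFF] occupy two
+/// bytes.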
+/// If ``n < 0``, the result is an empty string. +/// If ``n >= 256``, the result is equivalent to chr(``n % 256``). template struct ChrFunction { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE bool call(out_type& result, int64_t ord) { - if (ord < 0) { + FOLLY_ALWAYS_INLINE void call(out_type& result, int64_t n) { + if (n < 0) { result.resize(0); } else { - result.resize(1); - *result.data() = ord; + n = n & 0xFF; + if (n < 0x80) { + result.resize(1); + result.data()[0] = n; + } else { + result.resize(2); + result.data()[0] = 0xC0 + (n >> 6); + result.data()[1] = 0x80 + (n & 0x3F); + } } - return true; } }; diff --git a/velox/functions/sparksql/tests/StringTest.cpp b/velox/functions/sparksql/tests/StringTest.cpp index 71e3ab8e4586..9fc038bc5bfa 100644 --- a/velox/functions/sparksql/tests/StringTest.cpp +++ b/velox/functions/sparksql/tests/StringTest.cpp @@ -201,11 +201,16 @@ TEST_F(StringTest, Ascii) { } TEST_F(StringTest, Chr) { - EXPECT_EQ(chr(0), std::string("\0", 1)); - EXPECT_EQ(chr(32), " "); EXPECT_EQ(chr(-16), ""); - EXPECT_EQ(chr(256), std::string("\0", 1)); - EXPECT_EQ(chr(256 + 32), std::string(" ", 1)); + EXPECT_EQ(chr(0), std::string("\0", 1)); + EXPECT_EQ(chr(0x100), std::string("\0", 1)); + EXPECT_EQ(chr(0x1100), std::string("\0", 1)); + EXPECT_EQ(chr(0x20), "\x20"); + EXPECT_EQ(chr(0x100 + 0x20), "\x20"); + EXPECT_EQ(chr(0x80), "\xC2\x80"); + EXPECT_EQ(chr(0x100 + 0x80), "\xC2\x80"); + EXPECT_EQ(chr(0xFF), "\xC3\xBF"); + EXPECT_EQ(chr(0x100 + 0xFF), "\xC3\xBF"); EXPECT_EQ(chr(std::nullopt), std::nullopt); } diff --git a/velox/row/CMakeLists.txt b/velox/row/CMakeLists.txt index 2f57248047d5..03dff1030636 100644 --- a/velox/row/CMakeLists.txt +++ b/velox/row/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_row_fast UnsafeRowFast.cpp) +add_library(velox_row_fast UnsafeRowFast.cpp CompactRow.cpp) target_link_libraries(velox_row_fast velox_vector) diff --git a/velox/row/CompactRow.cpp b/velox/row/CompactRow.cpp new file mode 100644 index 000000000000..04196096a18c --- /dev/null +++ b/velox/row/CompactRow.cpp @@ -0,0 +1,961 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/velox/row/CMakeLists.txt b/velox/row/CMakeLists.txt
index 2f57248047d5..03dff1030636 100644
--- a/velox/row/CMakeLists.txt
+++ b/velox/row/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_library(velox_row_fast UnsafeRowFast.cpp)
+add_library(velox_row_fast UnsafeRowFast.cpp CompactRow.cpp)
 
 target_link_libraries(velox_row_fast velox_vector)
 
diff --git a/velox/row/CompactRow.cpp b/velox/row/CompactRow.cpp
new file mode 100644
index 000000000000..04196096a18c
--- /dev/null
+++ b/velox/row/CompactRow.cpp
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/row/CompactRow.h"
+#include "velox/vector/FlatVector.h"
+
+namespace facebook::velox::row {
+
+CompactRow::CompactRow(const RowVectorPtr& vector)
+    : typeKind_{vector->typeKind()}, decoded_{*vector} {
+  initialize(vector->type());
+}
+
+CompactRow::CompactRow(const VectorPtr& vector)
+    : typeKind_{vector->typeKind()}, decoded_{*vector} {
+  initialize(vector->type());
+}
+
+void CompactRow::initialize(const TypePtr& type) {
+  auto base = decoded_.base();
+  switch (typeKind_) {
+    case TypeKind::ARRAY: {
+      auto arrayBase = base->as<ArrayVector>();
+      children_.push_back(CompactRow(arrayBase->elements()));
+      childIsFixedWidth_.push_back(
+          arrayBase->elements()->type()->isFixedWidth());
+      break;
+    }
+    case TypeKind::MAP: {
+      auto mapBase = base->as<MapVector>();
+      children_.push_back(CompactRow(mapBase->mapKeys()));
+      children_.push_back(CompactRow(mapBase->mapValues()));
+      childIsFixedWidth_.push_back(mapBase->mapKeys()->type()->isFixedWidth());
+      childIsFixedWidth_.push_back(
+          mapBase->mapValues()->type()->isFixedWidth());
+      break;
+    }
+    case TypeKind::ROW: {
+      auto rowBase = base->as<RowVector>();
+      for (const auto& child : rowBase->children()) {
+        children_.push_back(CompactRow(child));
+        childIsFixedWidth_.push_back(child->type()->isFixedWidth());
+      }
+
+      rowNullBytes_ = bits::nbytes(type->size());
+      break;
+    }
+    case TypeKind::BOOLEAN:
+      valueBytes_ = 1;
+      fixedWidthTypeKind_ = true;
+      break;
+    case TypeKind::TINYINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::SMALLINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::INTEGER:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::BIGINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::HUGEINT:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::REAL:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::DOUBLE:
+      valueBytes_ = type->cppSizeInBytes();
+      fixedWidthTypeKind_ = true;
+      supportsBulkCopy_ = decoded_.isIdentityMapping();
+      break;
+    case TypeKind::TIMESTAMP:
+      valueBytes_ = sizeof(int64_t);
+      fixedWidthTypeKind_ = true;
+      break;
+    case TypeKind::VARCHAR:
+      FOLLY_FALLTHROUGH;
+    case TypeKind::VARBINARY:
+      // Nothing to do.
+      break;
+    case TypeKind::UNKNOWN:
+      // UNKNOWN values are always nulls, hence, do not take up space.
+ valueBytes_ = 0; + fixedWidthTypeKind_ = true; + supportsBulkCopy_ = true; + break; + default: + VELOX_UNSUPPORTED("Unsupported type: {}", type->toString()); + } +} + +// static +std::optional CompactRow::fixedRowSize(const RowTypePtr& rowType) { + const size_t numFields = rowType->size(); + const size_t nullLength = bits::nbytes(numFields); + + size_t size = nullLength; + for (const auto& child : rowType->children()) { + if (child->isTimestamp()) { + size += sizeof(int64_t); + } else if (child->isFixedWidth()) { + size += child->cppSizeInBytes(); + } else { + return std::nullopt; + } + } + + return size; +} + +int32_t CompactRow::rowSize(vector_size_t index) { + return rowRowSize(index); +} + +int32_t CompactRow::rowRowSize(vector_size_t index) { + auto childIndex = decoded_.index(index); + + const auto numFields = children_.size(); + int32_t size = rowNullBytes_; + + for (auto i = 0; i < numFields; ++i) { + if (childIsFixedWidth_[i]) { + size += children_[i].valueBytes_; + } else if (!children_[i].isNullAt(childIndex)) { + size += children_[i].variableWidthRowSize(childIndex); + } + } + + return size; +} + +int32_t CompactRow::serializeRow(vector_size_t index, char* buffer) { + auto childIndex = decoded_.index(index); + + int64_t valuesOffset = rowNullBytes_; + + auto* nulls = reinterpret_cast(buffer); + + for (auto i = 0; i < children_.size(); ++i) { + auto& child = children_[i]; + + // Write null bit. Advance offset if 'fixed-width'. + if (child.isNullAt(childIndex)) { + bits::setBit(nulls, i, true); + if (childIsFixedWidth_[i]) { + valuesOffset += child.valueBytes_; + } + continue; + } + + if (childIsFixedWidth_[i]) { + // Write fixed-width value. + if (child.valueBytes_ > 0) { + child.serializeFixedWidth(childIndex, buffer + valuesOffset); + } + valuesOffset += child.valueBytes_; + } else { + // Write non-null variable-width value. + auto size = + child.serializeVariableWidth(childIndex, buffer + valuesOffset); + valuesOffset += size; + } + } + + return valuesOffset; +} + +bool CompactRow::isNullAt(vector_size_t index) { + return decoded_.isNullAt(index); +} + +int32_t CompactRow::variableWidthRowSize(vector_size_t index) { + switch (typeKind_) { + case TypeKind::VARCHAR: + FOLLY_FALLTHROUGH; + case TypeKind::VARBINARY: { + auto value = decoded_.valueAt(index); + return sizeof(int32_t) + value.size(); + } + case TypeKind::ARRAY: + return arrayRowSize(index); + case TypeKind::MAP: + return mapRowSize(index); + case TypeKind::ROW: + return rowRowSize(index); + default: + VELOX_UNREACHABLE( + "Unexpected type kind: {}", mapTypeKindToName(typeKind_)); + }; +} + +int32_t CompactRow::arrayRowSize(vector_size_t index) { + auto baseIndex = decoded_.index(index); + + auto arrayBase = decoded_.base()->asUnchecked(); + auto offset = arrayBase->offsetAt(baseIndex); + auto size = arrayBase->sizeAt(baseIndex); + + return arrayRowSize(children_[0], offset, size, childIsFixedWidth_[0]); +} + +int32_t CompactRow::arrayRowSize( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth) { + const int32_t nullBytes = bits::nbytes(size); + + // array size | null bits | elements + + // 4 bytes for number of elements, some bytes for null flags. + int32_t rowSize = sizeof(int32_t) + nullBytes; + if (fixedWidth) { + return rowSize + size * elements.valueBytes(); + } + + if (size == 0) { + return rowSize; + } + + // If element type is a complex type, then add 4 bytes for overall serialized + // size of the array + 4 bytes per element for offset of the serialized + // element. 
+ // size | nulls | serialized size | serialized offset 1 | serialized offset 2 + // |...| element 1 | element 2 |... + + if (!(elements.typeKind_ == TypeKind::VARCHAR || + elements.typeKind_ == TypeKind::VARBINARY)) { + // 4 bytes for the overall serialized size + 4 bytes for the offset of each + // element. + rowSize += sizeof(int32_t) + size * sizeof(int32_t); + } + + for (auto i = 0; i < size; ++i) { + if (!elements.isNullAt(offset + i)) { + rowSize += elements.variableWidthRowSize(offset + i); + } + } + + return rowSize; +} + +int32_t CompactRow::serializeArray(vector_size_t index, char* buffer) { + auto baseIndex = decoded_.index(index); + + // For complex-type elements: + // array size | null bits | serialized size | offset e1 | offset e2 |... | e1 + // | e2 |... + // + // 'serialized size' is the number of bytes starting after null bits and to + // the end of the array. Offsets are specified relative to position right + // after 'serialized size'. + // + // For fixed-width or string element type: + // array size | null bite | e1 | e2 |... + + auto arrayBase = decoded_.base()->asUnchecked(); + auto offset = arrayBase->offsetAt(baseIndex); + auto size = arrayBase->sizeAt(baseIndex); + + return serializeAsArray( + children_[0], offset, size, childIsFixedWidth_[0], buffer); +} + +namespace { + +constexpr size_t kSizeBytes = sizeof(int32_t); + +void writeInt32(char* buffer, int32_t n) { + memcpy(buffer, &n, sizeof(int32_t)); +} + +int32_t readInt32(const char* buffer) { + int32_t n; + memcpy(&n, buffer, sizeof(int32_t)); + return n; +} +} // namespace + +int32_t CompactRow::serializeAsArray( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth, + char* buffer) { + // For complex-type elements: + // array size | null bits | serialized size | offset e1 | offset e2 |... | e1 + // | e2 |... + // + // For fixed-width and string element types: + // array size | null bits | e1 | e2 |... + + // Write array size. + writeInt32(buffer, size); + + // Write null flags. + const int32_t nullBytes = bits::nbytes(size); + const int32_t nullsOffset = kSizeBytes; + + int32_t elementsOffset = nullsOffset + nullBytes; + + auto* rawNulls = reinterpret_cast(buffer + nullsOffset); + + if (elements.supportsBulkCopy_) { + if (elements.decoded_.mayHaveNulls()) { + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } + } + } + elements.serializeFixedWidth(offset, size, buffer + elementsOffset); + return elementsOffset + size * elements.valueBytes_; + } + + if (fixedWidth) { + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } else { + elements.serializeFixedWidth(offset + i, buffer + elementsOffset); + } + elementsOffset += elements.valueBytes_; + } + } else if ( + elements.typeKind_ == TypeKind::VARCHAR || + elements.typeKind_ == TypeKind::VARBINARY) { + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } else { + auto serializedBytes = elements.serializeVariableWidth( + offset + i, buffer + elementsOffset); + elementsOffset += serializedBytes; + } + } + } else { + if (size > 0) { + // Leave room for serialized size and offsets. 
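To make the offset bookkeeping in this branch concrete, here is the layout arithmetic for an array of two non-null complex elements, as a standalone sketch with hypothetical element sizes:

```cpp
// Byte positions produced by serializeAsArray() for two complex elements
// whose serialized sizes are 10 and 14 bytes (hypothetical numbers).
#include <cstdio>

int main() {
  const int kSizeBytes = 4;
  int size = 2; // two elements
  int nullBytes = (size + 7) / 8; // bits::nbytes(2) == 1
  int elementsOffset = kSizeBytes + nullBytes; // serialized-size field at 5
  int baseOffset = elementsOffset + kSizeBytes; // offset table starts at 9
  int firstElement = baseOffset + size * kSizeBytes; // elements start at 17
  // Per-element offsets are stored relative to baseOffset:
  std::printf(
      "e1 at +%d, e2 at +%d\n",
      firstElement - baseOffset,
      firstElement + 10 - baseOffset); // e1 at +8, e2 at +18
  // The serialized-size field holds the total payload measured from
  // baseOffset to the end of the last element:
  std::printf("serialized size = %d\n", (firstElement + 10 + 14) - baseOffset);
  return 0;
}
```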
+ const size_t baseOffset = elementsOffset + kSizeBytes; + elementsOffset += kSizeBytes + size * kSizeBytes; + + for (auto i = 0; i < size; ++i) { + if (elements.isNullAt(offset + i)) { + bits::setBit(rawNulls, i, true); + } else { + writeInt32( + buffer + baseOffset + i * kSizeBytes, + elementsOffset - baseOffset); + + auto serializedBytes = elements.serializeVariableWidth( + offset + i, buffer + elementsOffset); + + elementsOffset += serializedBytes; + } + } + + writeInt32(buffer + baseOffset - kSizeBytes, elementsOffset - baseOffset); + } + } + + return elementsOffset; +} + +int32_t CompactRow::mapRowSize(vector_size_t index) { + auto baseIndex = decoded_.index(index); + + // | + + auto mapBase = decoded_.base()->asUnchecked(); + auto offset = mapBase->offsetAt(baseIndex); + auto size = mapBase->sizeAt(baseIndex); + + return arrayRowSize(children_[0], offset, size, childIsFixedWidth_[0]) + + arrayRowSize(children_[1], offset, size, childIsFixedWidth_[1]); +} + +int32_t CompactRow::serializeMap(vector_size_t index, char* buffer) { + auto baseIndex = decoded_.index(index); + + // | + + auto mapBase = decoded_.base()->asUnchecked(); + auto offset = mapBase->offsetAt(baseIndex); + auto size = mapBase->sizeAt(baseIndex); + + auto keysSerializedBytes = serializeAsArray( + children_[0], offset, size, childIsFixedWidth_[0], buffer); + + auto valuesSerializedBytes = serializeAsArray( + children_[1], + offset, + size, + childIsFixedWidth_[1], + buffer + keysSerializedBytes); + + return keysSerializedBytes + valuesSerializedBytes; +} + +int32_t CompactRow::serialize(vector_size_t index, char* buffer) { + return serializeRow(index, buffer); +} + +void CompactRow::serializeFixedWidth(vector_size_t index, char* buffer) { + VELOX_DCHECK(fixedWidthTypeKind_); + switch (typeKind_) { + case TypeKind::BOOLEAN: + *reinterpret_cast(buffer) = decoded_.valueAt(index); + break; + case TypeKind::TIMESTAMP: { + auto micros = decoded_.valueAt(index).toMicros(); + memcpy(buffer, µs, sizeof(int64_t)); + break; + } + default: + memcpy( + buffer, + decoded_.data() + decoded_.index(index) * valueBytes_, + valueBytes_); + } +} + +void CompactRow::serializeFixedWidth( + vector_size_t offset, + vector_size_t size, + char* buffer) { + VELOX_DCHECK(supportsBulkCopy_); + // decoded_.data() can be null if all values are null. + if (decoded_.data()) { + memcpy( + buffer, + decoded_.data() + decoded_.index(offset) * valueBytes_, + valueBytes_ * size); + } +} + +int32_t CompactRow::serializeVariableWidth(vector_size_t index, char* buffer) { + switch (typeKind_) { + case TypeKind::VARCHAR: + FOLLY_FALLTHROUGH; + case TypeKind::VARBINARY: { + auto value = decoded_.valueAt(index); + writeInt32(buffer, value.size()); + if (!value.empty()) { + memcpy(buffer + kSizeBytes, value.data(), value.size()); + } + return kSizeBytes + value.size(); + } + case TypeKind::ARRAY: + return serializeArray(index, buffer); + case TypeKind::MAP: + return serializeMap(index, buffer); + case TypeKind::ROW: + return serializeRow(index, buffer); + default: + VELOX_UNREACHABLE( + "Unexpected type kind: {}", mapTypeKindToName(typeKind_)); + }; +} + +namespace { + +// Reads single fixed-width value from buffer and returns number of bytes read. +// Stores the value into flatVector[index]. 
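A simplified standalone version of the helper described above, ignoring nulls and the Timestamp special case, to show the contract of copying sizeof(T) bytes and reporting how many were consumed:

```cpp
// Sketch of the fixed-width read contract used by the deserializers below.
#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename T>
size_t readValue(const char* buffer, T& out) {
  std::memcpy(&out, buffer, sizeof(T));
  return sizeof(T); // Caller advances its cursor by the returned byte count.
}

int main() {
  char buffer[12] = {};
  int32_t a = 42;
  std::memcpy(buffer, &a, sizeof(a));

  int32_t b;
  size_t consumed = readValue(buffer, b);
  std::printf("%d read in %zu bytes\n", b, consumed); // 42 read in 4 bytes
  return 0;
}
```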
+template +size_t readFixedWidthValue( + bool isNull, + const char* buffer, + FlatVector* flatVector, + vector_size_t index) { + if (isNull) { + flatVector->setNull(index, true); + } else if constexpr (std::is_same_v) { + int64_t micros; + memcpy(µs, buffer, sizeof(int64_t)); + flatVector->set(index, Timestamp::fromMicros(micros)); + } else { + T value; + memcpy(&value, buffer, sizeof(T)); + flatVector->set(index, value); + } + + if constexpr (std::is_same_v) { + return sizeof(int64_t); + } else { + return sizeof(T); + } +} + +// Deserializes one fixed-width value from each 'row' in 'data'. +// Each value starts at data[row].data() + offsets[row]. +// +// @param nulls Null flags for the values. +// @param offsets In/out parameter that specifies offsets in 'data' for the +// serialized values. Advances past the serialized value. +template +VectorPtr deserializeFixedWidth( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + using T = typename TypeTraits::NativeType; + + const auto numRows = data.size(); + auto flatVector = BaseVector::create>(type, numRows, pool); + + auto* rawNulls = nulls->as(); + + for (auto i = 0; i < numRows; ++i) { + offsets[i] += readFixedWidthValue( + bits::isBitNull(rawNulls, i), + data[i].data() + offsets[i], + flatVector.get(), + i); + } + + return flatVector; +} + +vector_size_t totalSize(const vector_size_t* rawSizes, size_t numRows) { + vector_size_t total = 0; + for (auto i = 0; i < numRows; ++i) { + total += rawSizes[i]; + } + return total; +} + +const uint8_t* readNulls(const char* buffer) { + return reinterpret_cast(buffer); +} + +// Deserializes multiple fixed-width values from each 'row' in 'data'. +// Each set of values starts at data[row].data() + offsets[row] and contains +// null flags followed by values. The number of values is provided in +// sizes[row]. +// nulls | v1 | v2 | v3 |... +// Advances offsets past the last value. +template +VectorPtr deserializeFixedWidthArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + using T = typename TypeTraits::NativeType; + + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + + const auto total = totalSize(rawSizes, numRows); + + auto flatVector = BaseVector::create>(type, total, pool); + + vector_size_t index = 0; + for (auto i = 0; i < numRows; ++i) { + const auto size = rawSizes[i]; + if (size > 0) { + auto nullBytes = bits::nbytes(size); + + auto* rawElementNulls = readNulls(data[i].data() + offsets[i]); + + offsets[i] += nullBytes; + + for (auto j = 0; j < size; ++j) { + offsets[i] += readFixedWidthValue( + bits::isBitSet(rawElementNulls, j), + data[i].data() + offsets[i], + flatVector.get(), + index); + ++index; + } + } + } + + return flatVector; +} + +int32_t readString( + const char* buffer, + FlatVector* flatVector, + vector_size_t index) { + int32_t size = readInt32(buffer); + StringView value(buffer + kSizeBytes, size); + flatVector->set(index, value); + return kSizeBytes + size; +} + +VectorPtr deserializeUnknowns( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + return BaseVector::createNullConstant(UNKNOWN(), data.size(), pool); +} + +// Deserializes one string from each 'row' in 'data'. +// Each strings starts at data[row].data() + offsets[row]. +// string size | +// Advances the offsets past the strings. 
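All deserializers in this file share one calling convention: data[row] holds a row's serialized bytes and offsets[row] is a per-row cursor that each column reader advances past whatever it consumed. A schematic, self-contained illustration of that pattern:

```cpp
// Per-row cursor pattern: columns are deserialized one at a time, and each
// column reader advances every row's cursor past the bytes it consumed.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main() {
  // Two serialized "rows", each holding an int32 followed by more data.
  std::vector<std::string> data(2, std::string(12, '\0'));
  std::vector<size_t> offsets(2, 0);

  // Column 1: read an int32 from each row, advancing that row's cursor.
  for (size_t row = 0; row < data.size(); ++row) {
    int32_t v;
    std::memcpy(&v, data[row].data() + offsets[row], sizeof(v));
    offsets[row] += sizeof(v);
  }

  // Column 2 now starts at offset 4 in every row.
  std::printf("%zu %zu\n", offsets[0], offsets[1]); // 4 4
  return 0;
}
```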
+VectorPtr deserializeStrings( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto flatVector = + BaseVector::create>(type, numRows, pool); + + auto* rawNulls = nulls->as(); + + for (auto i = 0; i < numRows; ++i) { + if (bits::isBitNull(rawNulls, i)) { + flatVector->setNull(i, true); + } else { + offsets[i] += + readString(data[i].data() + offsets[i], flatVector.get(), i); + } + } + + return flatVector; +} + +VectorPtr deserializeUnknownArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + const auto total = totalSize(rawSizes, numRows); + + return BaseVector::createNullConstant(UNKNOWN(), total, pool); +} + +// Deserializes multiple strings from each 'row' in 'data'. +// Each set of strings starts at data[row].data() + offsets[row] and contains +// null flags followed by the strings. The number of strings is provided in +// sizes[row]. +// nulls | size-of-s1 | | size-of-s2 | |... +// Advances offsets past the last string. +VectorPtr deserializeStringArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + + const auto total = totalSize(rawSizes, numRows); + + auto flatVector = + BaseVector::create>(type, total, pool); + + vector_size_t index = 0; + for (auto i = 0; i < numRows; ++i) { + const auto size = rawSizes[i]; + if (size > 0) { + auto nullBytes = bits::nbytes(size); + + auto* rawElementNulls = readNulls(data[i].data() + offsets[i]); + + offsets[i] += nullBytes; + + for (auto j = 0; j < size; ++j) { + if (bits::isBitSet(rawElementNulls, j)) { + flatVector->setNull(index++, true); + } else { + offsets[i] += + readString(data[i].data() + offsets[i], flatVector.get(), index); + ++index; + } + } + } + } + + return flatVector; +} + +VectorPtr deserialize( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool); + +// Deserializes multiple arrays from each 'row' in 'data'. +// Each set of arrays starts at data[row].data() + offsets[row] and contains +// null flags followed by the arrays. The number of arrays is provided in +// sizes[row]. +// nulls | serializes size | offset-of-a1 | offset-of-a2 |... +// |size-of-a1 | nulls-of-a1-elements | |... +// +// Advances offsets past the last array. +VectorPtr deserializeComplexArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& sizes, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + auto* rawSizes = sizes->as(); + + const auto total = totalSize(rawSizes, numRows); + + BufferPtr nulls = allocateNulls(total, pool); + auto* rawNulls = nulls->asMutable(); + + std::vector nestedData; + nestedData.reserve(total); + std::vector nestedOffsets; + nestedOffsets.reserve(total); + + vector_size_t nestedIndex = 0; + for (auto i = 0; i < numRows; ++i) { + const auto size = rawSizes[i]; + if (size > 0) { + // Read nulls. + auto* rawElementNulls = readNulls(data[i].data() + offsets[i]); + offsets[i] += bits::nbytes(size); + + // Read serialized size. + auto serializedSize = readInt32(data[i].data() + offsets[i]); + offsets[i] += kSizeBytes; + + // Read offsets of individual elements. 
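The loop below turns each stored relative offset into an absolute position by adding it to the row's cursor, which at this point references the start of the offset table. A condensed standalone sketch of that address arithmetic, with made-up numbers:

```cpp
// Resolving absolute element positions from the relative offset table, as
// deserializeComplexArrays() does. Hypothetical values, not Velox code.
#include <cstdint>
#include <cstdio>

int main() {
  size_t rowCursor = 100; // offsets[i]: start of the offset table in the row
  int32_t storedOffsets[] = {8, 18}; // relative to the table start
  for (int32_t j = 0; j < 2; ++j) {
    // Mirrors nestedOffsets.push_back(offsets[i] + nestedOffset).
    std::printf(
        "element %d at absolute offset %zu\n", j, rowCursor + storedOffsets[j]);
  }
  return 0;
}
```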
+ auto buffer = data[i].data() + offsets[i]; + for (auto j = 0; j < size; ++j) { + if (bits::isBitSet(rawElementNulls, j)) { + bits::setNull(rawNulls, nestedIndex++); + } else { + int32_t nestedOffset = readInt32(buffer + j * kSizeBytes); + nestedOffsets.push_back(offsets[i] + nestedOffset); + nestedData.push_back(data[i]); + ++nestedIndex; + } + } + + offsets[i] += serializedSize; + } + } + + return deserialize(type, nestedData, nulls, nestedOffsets, pool); +} + +// Deserializes one array from each 'row' in 'data'. +// Each array starts at data[row].data() + offsets[row]. +// size | element nulls | serialized size (if complex type elements) +// | element offsets (if complex type elements) | e1 | e2 | e3 |... +// +// Advances the offsets past the arrays. +ArrayVectorPtr deserializeArrays( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + + auto* rawNulls = nulls->as(); + + BufferPtr arrayOffsets = allocateOffsets(numRows, pool); + auto* rawArrayOffsets = arrayOffsets->asMutable(); + + BufferPtr arraySizes = allocateSizes(numRows, pool); + auto* rawArraySizes = arraySizes->asMutable(); + + vector_size_t arrayOffset = 0; + + for (auto i = 0; i < numRows; ++i) { + if (!bits::isBitNull(rawNulls, i)) { + // Read array size. + int32_t size = readInt32(data[i].data() + offsets[i]); + offsets[i] += kSizeBytes; + + rawArrayOffsets[i] = arrayOffset; + rawArraySizes[i] = size; + arrayOffset += size; + } + } + + VectorPtr elements; + const auto& elementType = type->childAt(0); + if (elementType->isUnKnown()) { + elements = + deserializeUnknownArrays(elementType, data, arraySizes, offsets, pool); + } else if (elementType->isFixedWidth()) { + elements = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + deserializeFixedWidthArrays, + elementType->kind(), + elementType, + data, + arraySizes, + offsets, + pool); + } else { + switch (elementType->kind()) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + elements = deserializeStringArrays( + elementType, data, arraySizes, offsets, pool); + break; + case TypeKind::ARRAY: + case TypeKind::MAP: + case TypeKind::ROW: + elements = deserializeComplexArrays( + elementType, data, arraySizes, offsets, pool); + break; + default: + VELOX_UNREACHABLE("{}", elementType->toString()); + } + } + + return std::make_shared( + pool, type, nulls, numRows, arrayOffsets, arraySizes, elements); +} + +// Deserializes one map from each 'row' in 'data'. +// Each map starts at data[row].data() + offsets[row]. +// array-of-keys | array-of-values +// Advances the offsets past the maps. 
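Because a map cell is serialized as its keys array immediately followed by its values array, map sizes compose additively from the array accounting above. A worked standalone example for the BIGINT map {1: 10, 2: 20}:

```cpp
// Size of a serialized map cell: keys array plus values array, each laid out
// as int32 size | null bits | elements, per mapRowSize() earlier in the file.
#include <cstdio>

int main() {
  int keysArray = 4 + 1 + 8 * 2;   // size + null byte + two BIGINT keys
  int valuesArray = 4 + 1 + 8 * 2; // size + null byte + two BIGINT values
  std::printf("%d\n", keysArray + valuesArray); // 42 bytes for the map cell
  return 0;
}
```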
+VectorPtr deserializeMaps( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + auto arrayOfKeysType = ARRAY(type->childAt(0)); + auto arrayOfValuesType = ARRAY(type->childAt(1)); + auto arrayOfKeys = + deserializeArrays(arrayOfKeysType, data, nulls, offsets, pool); + auto arrayOfValues = + deserializeArrays(arrayOfValuesType, data, nulls, offsets, pool); + + return std::make_shared( + pool, + type, + nulls, + data.size(), + arrayOfKeys->offsets(), + arrayOfKeys->sizes(), + arrayOfKeys->elements(), + arrayOfValues->elements()); +} + +RowVectorPtr deserializeRows( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool); + +// Switches on 'type' and calls type-specific deserialize method to deserialize +// one value from each 'row' in 'data' starting at the specified offset. +// Each value starts at data[row].data() + offsets[row]. +VectorPtr deserialize( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto typeKind = type->kind(); + + if (typeKind == TypeKind::UNKNOWN) { + return deserializeUnknowns(type, data, nulls, offsets, pool); + } + + if (type->isFixedWidth()) { + return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + deserializeFixedWidth, typeKind, type, data, nulls, offsets, pool); + } + switch (typeKind) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return deserializeStrings(type, data, nulls, offsets, pool); + break; + case TypeKind::ARRAY: + return deserializeArrays(type, data, nulls, offsets, pool); + break; + case TypeKind::MAP: + return deserializeMaps(type, data, nulls, offsets, pool); + break; + case TypeKind::ROW: + return deserializeRows(type, data, nulls, offsets, pool); + break; + default: + VELOX_UNREACHABLE("{}", type->toString()); + } +} + +// Deserializes one struct from each 'row' in 'data'. +// nulls | field1 | field2 |... +RowVectorPtr deserializeRows( + const TypePtr& type, + const std::vector& data, + const BufferPtr& nulls, + std::vector& offsets, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + const size_t numFields = type->size(); + + std::vector fields; + + auto* rawNulls = nulls != nullptr ? 
nulls->as() : nullptr; + + std::vector fieldNulls; + fieldNulls.reserve(numFields); + for (auto i = 0; i < numFields; ++i) { + fieldNulls.emplace_back(allocateNulls(numRows, pool)); + auto* rawFieldNulls = fieldNulls.back()->asMutable(); + for (auto row = 0; row < numRows; ++row) { + auto* serializedNulls = readNulls(data[row].data() + offsets[row]); + const auto isNull = + (rawNulls != nullptr && bits::isBitNull(rawNulls, row)) || + bits::isBitSet(serializedNulls, i); + bits::setBit(rawFieldNulls, row, !isNull); + } + } + + const size_t nullLength = bits::nbytes(numFields); + for (auto row = 0; row < numRows; ++row) { + offsets[row] += nullLength; + } + + for (auto i = 0; i < numFields; ++i) { + auto field = + deserialize(type->childAt(i), data, fieldNulls[i], offsets, pool); + fields.emplace_back(std::move(field)); + } + + return std::make_shared( + pool, type, nulls, numRows, std::move(fields)); +} + +} // namespace + +// static +RowVectorPtr CompactRow::deserialize( + const std::vector& data, + const RowTypePtr& rowType, + memory::MemoryPool* pool) { + const auto numRows = data.size(); + std::vector offsets(numRows, 0); + + return deserializeRows(rowType, data, nullptr, offsets, pool); +} + +} // namespace facebook::velox::row diff --git a/velox/row/CompactRow.h b/velox/row/CompactRow.h new file mode 100644 index 000000000000..9abaed0bdee2 --- /dev/null +++ b/velox/row/CompactRow.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/vector/ComplexVector.h" +#include "velox/vector/DecodedVector.h" + +namespace facebook::velox::row { + +class CompactRow { + public: + explicit CompactRow(const RowVectorPtr& vector); + + /// Returns row size if all fields are fixed width. Return std::nullopt if + /// there are variable-width fields. + static std::optional fixedRowSize(const RowTypePtr& rowType); + + /// Returns serialized size of the row at specified index. Use only if + /// 'fixedRowSize' returned std::nullopt. + int32_t rowSize(vector_size_t index); + + /// Serializes row at specified index into 'buffer'. + /// 'buffer' must have sufficient capacity and set to all zeros. + int32_t serialize(vector_size_t index, char* buffer); + + /// Deserializes multiple rows into a RowVector of specified type. The type + /// must match the contents of the serialized rows. + static RowVectorPtr deserialize( + const std::vector& data, + const RowTypePtr& rowType, + memory::MemoryPool* pool); + + private: + explicit CompactRow(const VectorPtr& vector); + + void initialize(const TypePtr& type); + + bool isNullAt(vector_size_t); + + /// Fixed-width types only. Returns number of bytes used by single value. + int32_t valueBytes() const { + return valueBytes_; + } + + /// Writes fixed-width value at specified index into 'buffer'. Value must not + /// be null. 
+ void serializeFixedWidth(vector_size_t index, char* buffer); + + /// Writes range of fixed-width values between 'offset' and 'offset + size' + /// into 'buffer'. Values can be null. + void + serializeFixedWidth(vector_size_t offset, vector_size_t size, char* buffer); + + /// Returns serialized size of variable-width row. + int32_t variableWidthRowSize(vector_size_t index); + + /// Writes variable-width value at specified index into 'buffer'. Value must + /// not be null. Returns number of bytes written to 'buffer'. + int32_t serializeVariableWidth(vector_size_t index, char* buffer); + + private: + /// Returns serialized size of array row. + int32_t arrayRowSize(vector_size_t index); + + /// Serializes array value to buffer. Value must not be null. Returns number + /// of bytes written to 'buffer'. + int32_t serializeArray(vector_size_t index, char* buffer); + + /// Returns serialized size of map row. + int32_t mapRowSize(vector_size_t index); + + /// Serializes map value to buffer. Value must not be null. Returns number of + /// bytes written to 'buffer'. + int32_t serializeMap(vector_size_t index, char* buffer); + + /// Returns serialized size of a range of values. + int32_t arrayRowSize( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth); + + /// Serializes a range of values into buffer. Returns number of bytes written + /// to 'buffer'. + int32_t serializeAsArray( + CompactRow& elements, + vector_size_t offset, + vector_size_t size, + bool fixedWidth, + char* buffer); + + /// Returns serialized size of struct value. + int32_t rowRowSize(vector_size_t index); + + /// Serializes struct value to buffer. Value must not be null. + int32_t serializeRow(vector_size_t index, char* buffer); + + const TypeKind typeKind_; + DecodedVector decoded_; + + /// True if values of 'typeKind_' have fixed width. + bool fixedWidthTypeKind_{false}; + + /// ARRAY, MAP and ROW types only. + std::vector children_; + std::vector childIsFixedWidth_; + + /// True if this is a flat fixed-width vector whose consecutive values can be + /// copied into serialized buffer in bulk. + bool supportsBulkCopy_{false}; + + // ROW type only. Number of bytes used by null flags. + size_t rowNullBytes_; + + // Fixed-width types only. Number of bytes used for a single value. 
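Putting the public API together, a condensed round-trip sketch, essentially what the CompactRowTest added later in this diff does; it assumes a RowVectorPtr 'data' and a MemoryPool* 'pool' are in scope:

```cpp
// Sketch only: serialize every row of 'data' into one zeroed buffer, then
// deserialize the views back into an equivalent RowVector.
#include <string_view>
#include <vector>
#include "velox/row/CompactRow.h"

using namespace facebook::velox;

RowVectorPtr roundTrip(const RowVectorPtr& data, memory::MemoryPool* pool) {
  auto rowType = asRowType(data->type());
  row::CompactRow row(data);

  // Fast path when every field is fixed-width; per-row sizes otherwise.
  size_t totalSize = 0;
  if (auto fixedRowSize = row::CompactRow::fixedRowSize(rowType)) {
    totalSize = fixedRowSize.value() * data->size();
  } else {
    for (vector_size_t i = 0; i < data->size(); ++i) {
      totalSize += row.rowSize(i);
    }
  }

  // serialize() requires a zero-initialized buffer; the trailing 0 does that.
  BufferPtr buffer = AlignedBuffer::allocate<char>(totalSize, pool, 0);
  auto* rawBuffer = buffer->asMutable<char>();

  std::vector<std::string_view> serialized;
  size_t offset = 0;
  for (vector_size_t i = 0; i < data->size(); ++i) {
    auto size = row.serialize(i, rawBuffer + offset);
    serialized.emplace_back(rawBuffer + offset, size);
    offset += size;
  }

  return row::CompactRow::deserialize(serialized, rowType, pool);
}
```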
+ size_t valueBytes_; +}; +} // namespace facebook::velox::row diff --git a/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp b/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp index a5f096efa6cd..f751f201d93b 100644 --- a/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp +++ b/velox/row/benchmark/UnsafeRowSerializeBenchmark.cpp @@ -18,6 +18,8 @@ #include "velox/common/memory/HashStringAllocator.h" #include "velox/exec/ContainerRowSerde.h" +#include "velox/row/CompactRow.h" +#include "velox/row/UnsafeRowDeserializers.h" #include "velox/row/UnsafeRowFast.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -26,6 +28,87 @@ namespace { class SerializeBenchmark { public: + void serializeUnsafe(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + suspender.dismiss(); + + UnsafeRowFast fast(data); + auto totalSize = computeTotalSize(fast, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(fast, data->size(), buffer); + VELOX_CHECK_EQ(serialized.size(), data->size()); + } + + void deserializeUnsafe(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + UnsafeRowFast fast(data); + auto totalSize = computeTotalSize(fast, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(fast, data->size(), buffer); + suspender.dismiss(); + + auto copy = UnsafeRowDeserializer::deserialize(serialized, rowType, pool()); + VELOX_CHECK_EQ(copy->size(), data->size()); + } + + void serializeCompact(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + suspender.dismiss(); + + CompactRow compact(data); + auto totalSize = computeTotalSize(compact, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(compact, data->size(), buffer); + VELOX_CHECK_EQ(serialized.size(), data->size()); + } + + void deserializeCompact(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + CompactRow compact(data); + auto totalSize = computeTotalSize(compact, rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool()); + auto serialized = serialize(compact, data->size(), buffer); + suspender.dismiss(); + + auto copy = CompactRow::deserialize(serialized, rowType, pool()); + VELOX_CHECK_EQ(copy->size(), data->size()); + } + + void serializeContainer(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + suspender.dismiss(); + + HashStringAllocator allocator(pool()); + auto position = serialize(data, allocator); + VELOX_CHECK_NOT_NULL(position.header); + } + + void deserializeContainer(const RowTypePtr& rowType) { + folly::BenchmarkSuspender suspender; + auto data = makeData(rowType); + + HashStringAllocator allocator(pool()); + auto position = serialize(data, allocator); + VELOX_CHECK_NOT_NULL(position.header); + suspender.dismiss(); + + auto copy = BaseVector::create(rowType, data->size(), pool()); + + ByteStream in; + HashStringAllocator::prepareRead(position.header, in); + for (auto i = 0; i < data->size(); ++i) { + exec::ContainerRowSerde::deserialize(in, i, copy.get()); + } + + VELOX_CHECK_EQ(copy->size(), data->size()); + } + + private: RowVectorPtr makeData(const RowTypePtr& rowType) { VectorFuzzer::Options options; options.vectorSize = 1'000; @@ -36,52 +119,84 @@ class SerializeBenchmark { 
return fuzzer.fuzzInputRow(rowType); } - void runUnsafe(const RowTypePtr& rowType) { - folly::BenchmarkSuspender suspender; - auto data = makeData(rowType); - suspender.dismiss(); - - UnsafeRowFast fast(data); - + size_t computeTotalSize( + UnsafeRowFast& unsafeRow, + const RowTypePtr& rowType, + vector_size_t numRows) { size_t totalSize = 0; if (auto fixedRowSize = UnsafeRowFast::fixedRowSize(rowType)) { - totalSize += fixedRowSize.value() * data->size(); + totalSize += fixedRowSize.value() * numRows; } else { - for (auto i = 0; i < data->size(); ++i) { - auto rowSize = fast.rowSize(i); + for (auto i = 0; i < numRows; ++i) { + auto rowSize = unsafeRow.rowSize(i); totalSize += rowSize; } } + return totalSize; + } - auto buffer = AlignedBuffer::allocate(totalSize, pool()); + std::vector> serialize( + UnsafeRowFast& unsafeRow, + vector_size_t numRows, + BufferPtr& buffer) { + std::vector> serialized; auto rawBuffer = buffer->asMutable(); size_t offset = 0; - for (auto i = 0; i < data->size(); ++i) { - auto rowSize = fast.serialize(i, rawBuffer + offset); + for (auto i = 0; i < numRows; ++i) { + auto rowSize = unsafeRow.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, rowSize)); offset += rowSize; } - VELOX_CHECK_EQ(totalSize, offset); + VELOX_CHECK_EQ(buffer->size(), offset); + return serialized; } - void runContainer(const RowTypePtr& rowType) { - folly::BenchmarkSuspender suspender; - auto data = makeData(rowType); - suspender.dismiss(); + size_t computeTotalSize( + CompactRow& compactRow, + const RowTypePtr& rowType, + vector_size_t numRows) { + size_t totalSize = 0; + if (auto fixedRowSize = CompactRow::fixedRowSize(rowType)) { + totalSize += fixedRowSize.value() * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + auto rowSize = compactRow.rowSize(i); + totalSize += rowSize; + } + } + return totalSize; + } - HashStringAllocator allocator(pool()); + std::vector + serialize(CompactRow& compactRow, vector_size_t numRows, BufferPtr& buffer) { + std::vector serialized; + auto rawBuffer = buffer->asMutable(); + + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + auto rowSize = compactRow.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, rowSize)); + offset += rowSize; + } + + VELOX_CHECK_EQ(buffer->size(), offset); + return serialized; + } + + HashStringAllocator::Position serialize( + const RowVectorPtr& data, + HashStringAllocator& allocator) { ByteStream out(&allocator); auto position = allocator.newWrite(out); for (auto i = 0; i < data->size(); ++i) { exec::ContainerRowSerde::serialize(*data, i, out); } allocator.finishWrite(out, 0); - - VELOX_CHECK_GT(out.size(), 0); + return position; } - private: memory::MemoryPool* pool() { return pool_.get(); } @@ -89,142 +204,86 @@ class SerializeBenchmark { std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; }; -BENCHMARK(unsafe_fixedWidth5) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})); -} - -BENCHMARK_RELATIVE(container_fixedWidth5) { - SerializeBenchmark benchmark; - benchmark.runContainer( - ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})); -} - -BENCHMARK(unsafe_fixedWidth10) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({ - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - DOUBLE(), - BIGINT(), - BIGINT(), - BIGINT(), - })); -} - -BENCHMARK_RELATIVE(container_fixedWidth10) { - SerializeBenchmark benchmark; - 
benchmark.runContainer(ROW({ - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - BIGINT(), - DOUBLE(), - BIGINT(), - BIGINT(), - BIGINT(), - })); -} - -BENCHMARK(unsafe_fixedWidth20) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({ - BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), - BIGINT(), BIGINT(), BIGINT(), DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), - DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), BIGINT(), BIGINT(), - })); -} - -BENCHMARK_RELATIVE(container_fixedWidth20) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({ - BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), - BIGINT(), BIGINT(), BIGINT(), DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), - DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), BIGINT(), BIGINT(), - })); -} - -BENCHMARK(unsafe_strings1) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), VARCHAR()})); -} - -BENCHMARK_RELATIVE(container_strings1) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), VARCHAR()})); -} - -BENCHMARK(unsafe_strings5) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({ - BIGINT(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - })); -} - -BENCHMARK_RELATIVE(container_strings5) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({ - BIGINT(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - VARCHAR(), - })); -} - -BENCHMARK(unsafe_arrays) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), ARRAY(BIGINT())})); -} - -BENCHMARK_RELATIVE(container_arrays) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), ARRAY(BIGINT())})); -} - -BENCHMARK(unsafe_nestedArrays) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))})); -} - -BENCHMARK_RELATIVE(container_nestedArrays) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))})); -} - -BENCHMARK(unsafe_maps) { - SerializeBenchmark benchmark; - benchmark.runUnsafe(ROW({BIGINT(), MAP(BIGINT(), REAL())})); -} - -BENCHMARK_RELATIVE(container_maps) { - SerializeBenchmark benchmark; - benchmark.runContainer(ROW({BIGINT(), MAP(BIGINT(), REAL())})); -} - -BENCHMARK(unsafe_structs) { - SerializeBenchmark benchmark; - benchmark.runUnsafe( - ROW({BIGINT(), ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})})); -} +#define SERDE_BENCHMARKS(name, rowType) \ + BENCHMARK(unsafe_serialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.serializeUnsafe(rowType); \ + } \ + \ + BENCHMARK(compact_serialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.serializeCompact(rowType); \ + } \ + \ + BENCHMARK(container_serialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.serializeContainer(rowType); \ + } \ + \ + BENCHMARK(unsafe_deserialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.deserializeUnsafe(rowType); \ + } \ + \ + BENCHMARK(compact_deserialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.deserializeCompact(rowType); \ + } \ + \ + BENCHMARK(container_deserialize_##name) { \ + SerializeBenchmark benchmark; \ + benchmark.deserializeContainer(rowType); \ + } -BENCHMARK_RELATIVE(container_structs) { - SerializeBenchmark benchmark; - benchmark.runContainer( - ROW({BIGINT(), ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})})); -} +SERDE_BENCHMARKS( + fixedWidth5, + ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})); + +SERDE_BENCHMARKS( + fixedWidth10, + ROW({ + BIGINT(), + 
BIGINT(), + BIGINT(), + BIGINT(), + BIGINT(), + BIGINT(), + DOUBLE(), + BIGINT(), + BIGINT(), + BIGINT(), + })); + +SERDE_BENCHMARKS( + fixedWidth20, + ROW({ + BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), BIGINT(), + BIGINT(), BIGINT(), BIGINT(), DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), + DOUBLE(), DOUBLE(), DOUBLE(), DOUBLE(), BIGINT(), BIGINT(), + })); + +SERDE_BENCHMARKS(strings1, ROW({BIGINT(), VARCHAR()})); + +SERDE_BENCHMARKS( + strings5, + ROW({ + BIGINT(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + })); + +SERDE_BENCHMARKS(arrays, ROW({BIGINT(), ARRAY(BIGINT())})); + +SERDE_BENCHMARKS(nestedArrays, ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))})); + +SERDE_BENCHMARKS(maps, ROW({BIGINT(), MAP(BIGINT(), REAL())})); + +SERDE_BENCHMARKS( + structs, + ROW({BIGINT(), ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()})})); } // namespace } // namespace facebook::velox::row diff --git a/velox/row/tests/CMakeLists.txt b/velox/row/tests/CMakeLists.txt index 0f6792fb5a87..271cd1fb88cc 100644 --- a/velox/row/tests/CMakeLists.txt +++ b/velox/row/tests/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_executable(velox_row_test UnsafeRowFuzzTest.cpp) +add_executable(velox_row_test UnsafeRowFuzzTest.cpp CompactRowTest.cpp) add_test(velox_row_test velox_row_test) diff --git a/velox/row/tests/CompactRowTest.cpp b/velox/row/tests/CompactRowTest.cpp new file mode 100644 index 000000000000..46417d06ee8f --- /dev/null +++ b/velox/row/tests/CompactRowTest.cpp @@ -0,0 +1,518 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "velox/row/CompactRow.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook::velox::test; + +namespace facebook::velox::row { +namespace { + +class CompactRowTest : public ::testing::Test, public VectorTestBase { + protected: + void testRoundTrip(const RowVectorPtr& data) { + SCOPED_TRACE(data->toString()); + + auto rowType = asRowType(data->type()); + auto numRows = data->size(); + + CompactRow row(data); + + size_t totalSize = 0; + if (auto fixedRowSize = CompactRow::fixedRowSize(rowType)) { + totalSize = fixedRowSize.value() * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + totalSize += row.rowSize(i); + } + } + + std::vector serialized; + + BufferPtr buffer = AlignedBuffer::allocate(totalSize, pool(), 0); + auto* rawBuffer = buffer->asMutable(); + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + auto size = row.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, size)); + offset += size; + + VELOX_CHECK_EQ(size, row.rowSize(i), "Row {}: {}", i, data->toString(i)); + } + + VELOX_CHECK_EQ(offset, totalSize); + + auto copy = CompactRow::deserialize(serialized, rowType, pool()); + assertEqualVectors(data, copy); + } +}; + +TEST_F(CompactRowTest, fixedRowSize) { + ASSERT_EQ(1 + 1, CompactRow::fixedRowSize(ROW({BOOLEAN()}))); + ASSERT_EQ(1 + 8, CompactRow::fixedRowSize(ROW({BIGINT()}))); + ASSERT_EQ(1 + 4, CompactRow::fixedRowSize(ROW({INTEGER()}))); + ASSERT_EQ(1 + 2, CompactRow::fixedRowSize(ROW({SMALLINT()}))); + ASSERT_EQ(1 + 8, CompactRow::fixedRowSize(ROW({DOUBLE()}))); + ASSERT_EQ(std::nullopt, CompactRow::fixedRowSize(ROW({VARCHAR()}))); + ASSERT_EQ(std::nullopt, CompactRow::fixedRowSize(ROW({ARRAY(BIGINT())}))); + ASSERT_EQ( + 1 + 1 + 8 + 4 + 2 + 8, + CompactRow::fixedRowSize( + ROW({BOOLEAN(), BIGINT(), INTEGER(), SMALLINT(), DOUBLE()}))); + + ASSERT_EQ(std::nullopt, CompactRow::fixedRowSize(ROW({BIGINT(), VARCHAR()}))); + ASSERT_EQ( + std::nullopt, + CompactRow::fixedRowSize(ROW({BIGINT(), ROW({VARCHAR()})}))); + + ASSERT_EQ(1, CompactRow::fixedRowSize(ROW({UNKNOWN()}))); +} + +TEST_F(CompactRowTest, rowSizeString) { + auto data = makeRowVector({ + makeFlatVector({"a", "abc", "Longer string", "d", ""}), + }); + + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for string size. N bytes for the string + // itself. + ASSERT_EQ(1 + 4 + 1, row.rowSize(0)); + ASSERT_EQ(1 + 4 + 3, row.rowSize(1)); + ASSERT_EQ(1 + 4 + 13, row.rowSize(2)); + ASSERT_EQ(1 + 4 + 1, row.rowSize(3)); + ASSERT_EQ(1 + 4 + 0, row.rowSize(4)); +} + +TEST_F(CompactRowTest, rowSizeArrayOfBigint) { + auto data = makeRowVector({ + makeArrayVector({ + {1, 2, 3}, + {4, 5}, + {}, + {6}, + }), + }); + + { + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for array + // size. 1 byte for null flags for elements. N bytes for array elements. + ASSERT_EQ(1 + 4 + 1 + 8 * 3, row.rowSize(0)); + ASSERT_EQ(1 + 4 + 1 + 8 * 2, row.rowSize(1)); + ASSERT_EQ(1 + 4, row.rowSize(2)); + ASSERT_EQ(1 + 4 + 1 + 8, row.rowSize(3)); + } + + data = makeRowVector({ + makeNullableArrayVector({ + {{1, 2, std::nullopt, 3}}, + {{4, 5}}, + {{}}, + std::nullopt, + {{6}}, + }), + }); + + { + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for array + // size. 1 byte for null flags for elements. N bytes for array elements. 
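Note that a null fixed-width element still occupies its value slot, since serializeAsArray() advances past valueBytes_ even for nulls. Written out as standalone arithmetic for the nullable array {1, 2, null, 3} checked below:

```cpp
// Expected rowSize for the row holding the nullable BIGINT array
// {1, 2, null, 3}: the null element still occupies its 8-byte slot.
#include <cstdio>

int main() {
  int rowNullFlags = 1; // bits::nbytes(1 top-level field)
  int arraySize = 4;    // int32 element count
  int elementNulls = 1; // bits::nbytes(4 elements)
  int elements = 8 * 4; // four BIGINT slots, including the null one
  std::printf("%d\n", rowNullFlags + arraySize + elementNulls + elements); // 38
  return 0;
}
```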
+ ASSERT_EQ(1 + 4 + 1 + 8 * 4, row.rowSize(0)); + ASSERT_EQ(1 + 4 + 1 + 8 * 2, row.rowSize(1)); + ASSERT_EQ(1 + 4, row.rowSize(2)); + ASSERT_EQ(1, row.rowSize(3)); + ASSERT_EQ(1 + 4 + 1 + 8, row.rowSize(4)); + } +} + +TEST_F(CompactRowTest, rowSizeMixed) { + auto data = makeRowVector({ + makeNullableFlatVector({1, 2, 3, std::nullopt}), + makeNullableFlatVector({"a", "abc", "", std::nullopt}), + }); + + CompactRow row(data); + + // 1 byte for null flags. 8 bytes for bigint field. 4 bytes for string size. + // N bytes for the string itself. + ASSERT_EQ(1 + 8 + (4 + 1), row.rowSize(0)); + ASSERT_EQ(1 + 8 + (4 + 3), row.rowSize(1)); + ASSERT_EQ(1 + 8 + (4 + 0), row.rowSize(2)); + ASSERT_EQ(1 + 8, row.rowSize(3)); +} + +TEST_F(CompactRowTest, rowSizeArrayOfStrings) { + auto data = makeRowVector({ + makeArrayVector({ + {"a", "Abc"}, + {}, + {"a", "Longer string", "abc"}, + }), + }); + + { + CompactRow row(data); + + // 1 byte for null flags. 4 bytes for array + // size. 1 byte for nulls flags for elements. 4 bytes for serialized size. 4 + // bytes per offset of an element. N bytes for elements. Each string element + // is 4 bytes for size + string length. + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + (4 + 3), row.rowSize(0)); + ASSERT_EQ(1 + 4, row.rowSize(1)); + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + (4 + 13) + (4 + 3), row.rowSize(2)); + } + + data = makeRowVector({ + makeNullableArrayVector({ + {{"a", "Abc", std::nullopt}}, + {{}}, + std::nullopt, + {{"a", std::nullopt, "Longer string", "abc"}}, + }), + }); + + { + CompactRow row(data); + + // Null strings do not take space. + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + (4 + 3) + 0, row.rowSize(0)); + ASSERT_EQ(1 + 4, row.rowSize(1)); + ASSERT_EQ(1, row.rowSize(2)); + ASSERT_EQ(1 + 4 + 1 + (4 + 1) + 0 + (4 + 13) + (4 + 3), row.rowSize(3)); + } +} + +TEST_F(CompactRowTest, boolean) { + auto data = makeRowVector({ + makeFlatVector( + {true, false, true, true, false, false, true, false}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableFlatVector({ + true, + false, + std::nullopt, + true, + std::nullopt, + false, + true, + false, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, bigint) { + auto data = makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableFlatVector( + {1, std::nullopt, 3, std::nullopt, 5, std::nullopt}), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, hugeint) { + auto data = makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableFlatVector( + {std::nullopt, 1, 2, std::nullopt, std::nullopt, 3, 4, 5}), + }); + + testRoundTrip(data); +} + +Timestamp ts(int64_t micros) { + return Timestamp::fromMicros(micros); +} + +TEST_F(CompactRowTest, timestamp) { + auto data = makeRowVector({ + makeFlatVector({ + ts(0), + ts(1), + ts(2), + }), + }); + + testRoundTrip(data); + + // Serialize null Timestamp values with null flags set over a large + // non-serializable value (e.g. a value that triggers an exception in + // Timestamp::toMicros()). 
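CompactRow stores timestamps as int64 microseconds, toMicros() on write and fromMicros() on read, so values like Timestamp::max() that cannot convert must sit behind a null flag. A small sketch of the round trip the format relies on (the header path is assumed from Velox's layout):

```cpp
// Round trip through the microsecond representation CompactRow serializes.
#include <cassert>
#include "velox/type/Timestamp.h"

int main() {
  using facebook::velox::Timestamp;
  Timestamp t = Timestamp::fromMicros(123'456);
  assert(t.toMicros() == 123'456);
  // Timestamp::max() is outside the convertible range, which is why the test
  // below only stores it under a null flag.
  return 0;
}
```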
+ data = makeRowVector({ + makeFlatVector({ + ts(0), + Timestamp::max(), + ts(123'456), + Timestamp::min(), + }), + }); + + data->childAt(0)->setNull(1, true); + data->childAt(0)->setNull(3, true); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, string) { + auto data = makeRowVector({ + makeFlatVector({"a", "Abc", "", "Longer test string"}), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, unknown) { + auto data = makeRowVector({ + makeAllNullFlatVector(10), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeArrayVector({0, 3, 5, 9}, makeAllNullFlatVector(10)), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, mix) { + auto data = makeRowVector({ + makeFlatVector({"a", "Abc", "", "Longer test string"}), + makeAllNullFlatVector(4), + makeFlatVector({1, 2, 3, 4}), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, arrayOfBigint) { + auto data = makeRowVector({ + makeArrayVector({ + {1, 2, 3}, + {4, 5}, + {6}, + {}, + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableArrayVector({ + {{1, 2, std::nullopt, 3}}, + {{4, 5, std::nullopt}}, + {{std::nullopt, 6}}, + {{std::nullopt}}, + std::nullopt, + {{}}, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, arrayOfTimestamp) { + auto data = makeRowVector({ + makeArrayVector({ + {ts(1), ts(2), ts(3)}, + {ts(4), ts(5)}, + {ts(6)}, + {}, + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableArrayVector({ + {{ts(1), ts(2), std::nullopt, ts(3)}}, + {{ts(4), ts(5), std::nullopt}}, + {{std::nullopt, ts(6)}}, + {{std::nullopt}}, + std::nullopt, + {{}}, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, arrayOfString) { + auto data = makeRowVector({ + makeArrayVector({ + {"a", "abc", "Longer test string"}, + {"b", "Abc 12345 ...test", "foo"}, + {}, + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeNullableArrayVector({ + {{"a", std::nullopt, "abc", "Longer test string"}}, + {{std::nullopt, + "b", + std::nullopt, + "Abc 12345 ...test", + std::nullopt, + "foo"}}, + {{}}, + {{std::nullopt}}, + std::nullopt, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, map) { + auto data = makeRowVector({ + makeMapVector( + {{{1, 10}, {2, 20}, {3, 30}}, {{1, 11}, {2, 22}}, {{4, 444}}, {}}), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeMapVector({ + {{"a", "100"}, + {"b", "200"}, + {"Long string for testing", "Another long string"}}, + {{"abc", "300"}, {"d", "400"}}, + {}, + }), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, row) { + auto data = makeRowVector({ + makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + makeFlatVector({1.05, 2.05, 3.05, 4.05, 5.05}), + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeRowVector({ + makeFlatVector({1, 2, 3, 4, 5}), + makeFlatVector( + {"a", "Abc", "Long test string", "", "d"}), + makeFlatVector({1.05, 2.05, 3.05, 4.05, 5.05}), + }), + }); + + testRoundTrip(data); + + data = makeRowVector({ + makeRowVector( + { + makeFlatVector({1, 2, 3, 4, 5}), + makeNullableFlatVector({-1, 2, -3, std::nullopt, -5}), + makeFlatVector({1.05, 2.05, 3.05, 4.05, 5.05}), + makeFlatVector( + {"a", "Abc", "Long test string", "", "d"}), + }, + nullEvery(2)), + }); + + testRoundTrip(data); +} + +TEST_F(CompactRowTest, fuzz) { + auto rowType = ROW({ + ROW({BIGINT(), VARCHAR(), DOUBLE()}), + MAP(VARCHAR(), ROW({ARRAY(BIGINT()), ARRAY(VARCHAR()), REAL()})), + ARRAY(ROW({BIGINT(), DOUBLE()})), + ARRAY(MAP(BIGINT(), DOUBLE())), + BIGINT(), + 
ARRAY(MAP(BIGINT(), VARCHAR())), + ARRAY(MAP(VARCHAR(), REAL())), + MAP(BIGINT(), ARRAY(BIGINT())), + BIGINT(), + ARRAY(BIGINT()), + DOUBLE(), + MAP(VARCHAR(), VARCHAR()), + VARCHAR(), + ARRAY(ARRAY(BIGINT())), + BIGINT(), + ARRAY(ARRAY(VARCHAR())), + }); + + VectorFuzzer::Options opts; + opts.vectorSize = 100; + opts.containerLength = 5; + opts.nullRatio = 0.1; + opts.containerHasNulls = true; + opts.dictionaryHasNulls = false; + opts.stringVariableLength = true; + opts.stringLength = 20; + opts.containerVariableLength = true; + opts.complexElementsMaxSize = 1'000; + + // Spark uses microseconds to store timestamp + opts.timestampPrecision = + VectorFuzzer::Options::TimestampPrecision::kMicroSeconds; + + VectorFuzzer fuzzer(opts, pool_.get()); + + const auto iterations = 200; + for (size_t i = 0; i < iterations; ++i) { + auto seed = folly::Random::rand32(); + + LOG(INFO) << i << ": seed: " << seed; + SCOPED_TRACE(fmt::format("seed: {}", seed)); + + fuzzer.reSeed(seed); + auto data = fuzzer.fuzzInputRow(rowType); + + testRoundTrip(data); + + if (Test::HasFailure()) { + break; + } + } +} + +} // namespace +} // namespace facebook::velox::row diff --git a/velox/serializers/CMakeLists.txt b/velox/serializers/CMakeLists.txt index 9fa18e048321..f9264fd3e57a 100644 --- a/velox/serializers/CMakeLists.txt +++ b/velox/serializers/CMakeLists.txt @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_presto_serializer PrestoSerializer.cpp - UnsafeRowSerializer.cpp) +add_library( + velox_presto_serializer PrestoSerializer.cpp UnsafeRowSerializer.cpp + CompactRowSerializer.cpp) -target_link_libraries(velox_presto_serializer velox_vector) +target_link_libraries(velox_presto_serializer velox_dwio_common velox_vector) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/serializers/CompactRowSerializer.cpp b/velox/serializers/CompactRowSerializer.cpp new file mode 100644 index 000000000000..eca7a60d8181 --- /dev/null +++ b/velox/serializers/CompactRowSerializer.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "velox/serializers/CompactRowSerializer.h" +#include +#include "velox/row/CompactRow.h" + +namespace facebook::velox::serializer { + +void CompactRowVectorSerde::estimateSerializedSize( + VectorPtr /* vector */, + const folly::Range& /* ranges */, + vector_size_t** /* sizes */) { + VELOX_UNSUPPORTED(); +} + +namespace { +class CompactRowVectorSerializer : public VectorSerializer { + public: + using TRowSize = uint32_t; + + explicit CompactRowVectorSerializer(StreamArena* streamArena) + : pool_{streamArena->pool()} {} + + void append( + const RowVectorPtr& vector, + const folly::Range& ranges) override { + size_t totalSize = 0; + row::CompactRow row(vector); + if (auto fixedRowSize = + row::CompactRow::fixedRowSize(asRowType(vector->type()))) { + for (const auto& range : ranges) { + totalSize += (fixedRowSize.value() + sizeof(TRowSize)) * range.size; + } + + } else { + for (const auto& range : ranges) { + for (auto i = range.begin; i < range.begin + range.size; ++i) { + totalSize += row.rowSize(i) + sizeof(TRowSize); + } + } + } + + if (totalSize == 0) { + return; + } + + BufferPtr buffer = AlignedBuffer::allocate(totalSize, pool_, 0); + auto rawBuffer = buffer->asMutable(); + buffers_.push_back(std::move(buffer)); + + size_t offset = 0; + for (auto& range : ranges) { + for (auto i = range.begin; i < range.begin + range.size; ++i) { + // Write row data. + TRowSize size = row.serialize(i, rawBuffer + offset + sizeof(TRowSize)); + + // Write raw size. Needs to be in big endian order. + *(TRowSize*)(rawBuffer + offset) = folly::Endian::big(size); + offset += sizeof(TRowSize) + size; + } + } + } + + void flush(OutputStream* stream) override { + for (const auto& buffer : buffers_) { + stream->write(buffer->as(), buffer->size()); + } + buffers_.clear(); + } + + private: + memory::MemoryPool* const FOLLY_NONNULL pool_; + std::vector buffers_; +}; +} // namespace + +std::unique_ptr CompactRowVectorSerde::createSerializer( + RowTypePtr /* type */, + int32_t /* numRows */, + StreamArena* streamArena, + const Options* /* options */) { + return std::make_unique(streamArena); +} + +void CompactRowVectorSerde::deserialize( + ByteStream* source, + velox::memory::MemoryPool* pool, + RowTypePtr type, + RowVectorPtr* result, + const Options* /* options */) { + std::vector serializedRows; + while (!source->atEnd()) { + // First read row size in big endian order. + auto rowSize = folly::Endian::big( + source->read()); + auto row = source->nextView(rowSize); + VELOX_CHECK_EQ(row.size(), rowSize); + serializedRows.push_back(row); + } + + if (serializedRows.empty()) { + *result = BaseVector::create(type, 0, pool); + return; + } + + *result = velox::row::CompactRow::deserialize(serializedRows, type, pool); +} + +// static +void CompactRowVectorSerde::registerVectorSerde() { + velox::registerVectorSerde(std::make_unique()); +} + +} // namespace facebook::velox::serializer diff --git a/velox/serializers/CompactRowSerializer.h b/velox/serializers/CompactRowSerializer.h new file mode 100644 index 000000000000..3ad0c99cbfa3 --- /dev/null +++ b/velox/serializers/CompactRowSerializer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
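The wire framing used by CompactRowVectorSerializer above is deliberately simple: each row is prefixed with its length as a big-endian uint32, which deserialize() reads back before taking a view over the row bytes. A standalone sketch of that framing with folly's Endian helpers:

```cpp
// Length-prefixed row framing: big-endian uint32 size, then the row bytes.
#include <folly/lang/Bits.h>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

std::string frameRow(const std::string& rowBytes) {
  uint32_t size = folly::Endian::big<uint32_t>(rowBytes.size());
  std::string framed(sizeof(size), '\0');
  std::memcpy(framed.data(), &size, sizeof(size));
  return framed + rowBytes;
}

std::string unframeRow(const std::string& framed) {
  uint32_t size;
  std::memcpy(&size, framed.data(), sizeof(size));
  size = folly::Endian::big(size); // big<->host conversion is symmetric
  return framed.substr(sizeof(size), size);
}

int main() {
  assert(unframeRow(frameRow("hello")) == "hello");
  return 0;
}
```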
diff --git a/velox/serializers/CompactRowSerializer.h b/velox/serializers/CompactRowSerializer.h
new file mode 100644
index 000000000000..3ad0c99cbfa3
--- /dev/null
+++ b/velox/serializers/CompactRowSerializer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/vector/ComplexVector.h"
+#include "velox/vector/VectorStream.h"
+
+namespace facebook::velox::serializer {
+
+class CompactRowVectorSerde : public VectorSerde {
+ public:
+  CompactRowVectorSerde() = default;
+
+  // We do not implement this method since it is not used in production code.
+  void estimateSerializedSize(
+      VectorPtr vector,
+      const folly::Range<const IndexRange*>& ranges,
+      vector_size_t** sizes) override;
+
+  // This method is not used in production code. It is only used to
+  // support round-trip tests for deserialization.
+  std::unique_ptr<VectorSerializer> createSerializer(
+      RowTypePtr type,
+      int32_t numRows,
+      StreamArena* streamArena,
+      const Options* options) override;
+
+  // This method is used when reading data from the exchange.
+  void deserialize(
+      ByteStream* source,
+      velox::memory::MemoryPool* pool,
+      RowTypePtr type,
+      RowVectorPtr* result,
+      const Options* options) override;
+
+  static void registerVectorSerde();
+};
+
+} // namespace facebook::velox::serializer
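Putting the two files together, a round trip through the new serde looks roughly like the following sketch. It mirrors the CompactRowSerializerTest added further down and assumes a leaf memory pool `pool` and an input `RowVectorPtr data`:

// Sketch only: serialize a RowVector with CompactRowVectorSerde.
using namespace facebook::velox;

serializer::CompactRowVectorSerde serde;
StreamArena arena(pool);
auto serializer = serde.createSerializer(
    asRowType(data->type()), data->size(), &arena, /*options=*/nullptr);

// Serialize all rows as a single contiguous range.
IndexRange allRows{0, data->size()};
serializer->append(data, folly::Range(&allRows, 1));

std::ostringstream out;
OStreamOutputStream outStream(&out);
serializer->flush(&outStream);
// out.str() now holds the big-endian length-prefixed frames that
// CompactRowVectorSerde::deserialize() reads back into a RowVector.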
diff --git a/velox/serializers/PrestoSerializer.cpp b/velox/serializers/PrestoSerializer.cpp
index 9c65f6d926db..904924d35f36 100644
--- a/velox/serializers/PrestoSerializer.cpp
+++ b/velox/serializers/PrestoSerializer.cpp
@@ -137,6 +137,18 @@ std::string typeToEncodingName(const TypePtr& type) {
   }
 }
 
+PrestoVectorSerde::PrestoOptions toPrestoOptions(
+    const VectorSerde::Options* options) {
+  if (options == nullptr) {
+    return PrestoVectorSerde::PrestoOptions();
+  }
+  return *(static_cast<const PrestoVectorSerde::PrestoOptions*>(options));
+}
+
+FOLLY_ALWAYS_INLINE bool needCompression(const folly::io::Codec& codec) {
+  return codec.type() != folly::io::CodecType::NO_COMPRESSION;
+}
+
 template <typename T>
 void readValues(
     ByteStream* source,
@@ -1558,7 +1570,10 @@ class PrestoVectorSerializer : public VectorSerializer {
       std::shared_ptr<const RowType> rowType,
       int32_t numRows,
       StreamArena* streamArena,
-      bool useLosslessTimestamp) {
+      bool useLosslessTimestamp,
+      common::CompressionKind compressionKind)
+      : streamArena_(streamArena),
+        codec_(common::compressionKindToCodec(compressionKind)) {
     auto types = rowType->children();
     auto numTypes = types.size();
     streams_.resize(numTypes);
@@ -1580,6 +1595,9 @@ class PrestoVectorSerializer : public VectorSerializer {
     }
   }
 
+  // The SerializedPage layout is:
+  // numRows(4) | codec(1) | uncompressedSize(4) | compressedSize(4) |
+  // checksum(8) | data
   void flush(OutputStream* out) override {
     flushInternal(numRows_, false /*rle*/, out);
   }
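Both flush paths introduced below emit this same fixed-size header; only the data section differs. As a plain struct, purely for illustration (the patch writes each field individually and never defines such a type):

// Illustrative layout of a serialized PrestoPage header.
struct PrestoPageHeader {
  int32_t numRows;
  int8_t codecMarker;       // kCheckSumBitMask and/or kCompressedBitMask.
  int32_t uncompressedSize; // Size of "column count + column streams".
  int32_t compressedSize;   // Both size fields match on the uncompressed path.
  int64_t checksum;         // Zero unless a PrestoOutputStreamListener is set.
};
// The data section follows: the column count and the column streams, wrapped
// in a single compressed block when the codec is not NO_COMPRESSION.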
@@ -1596,21 +1614,18 @@ class PrestoVectorSerializer : public VectorSerializer {
     flushInternal(vector->size(), true /*rle*/, out);
   }
 
-  // Writes the contents to 'stream' in wire format
-  void flushInternal(int32_t numRows, bool rle, OutputStream* out) {
-    auto listener = dynamic_cast<PrestoOutputStreamListener*>(out->listener());
-    // Reset CRC computation
-    if (listener) {
-      listener->reset();
-    }
+ private:
+  void flushUncompressed(
+      int32_t numRows,
+      bool rle,
+      OutputStream* out,
+      PrestoOutputStreamListener* listener) {
+    int32_t offset = out->tellp();
 
     char codec = 0;
     if (listener) {
       codec = getCodecMarker();
     }
-
-    int32_t offset = out->tellp();
-
     // Pause CRC computation
     if (listener) {
       listener->pause();
@@ -1622,7 +1637,8 @@ class PrestoVectorSerializer : public VectorSerializer {
     // Make space for uncompressedSizeInBytes & sizeInBytes
     writeInt32(out, 0);
     writeInt32(out, 0);
-    writeInt64(out, 0); // Write zero checksum
+    // Write zero checksum.
+    writeInt64(out, 0);
 
     // Number of columns and stream content. Unpause CRC.
     if (listener) {
@@ -1662,10 +1678,92 @@ class PrestoVectorSerializer : public VectorSerializer {
     out->seekp(offset + size);
   }
 
- private:
+  void flushCompressed(
+      int32_t numRows,
+      bool rle,
+      OutputStream* output,
+      PrestoOutputStreamListener* listener) {
+    const int32_t offset = output->tellp();
+    char codec = kCompressedBitMask;
+    if (listener) {
+      codec |= kCheckSumBitMask;
+    }
+
+    // Pause CRC computation
+    if (listener) {
+      listener->pause();
+    }
+
+    writeInt32(output, numRows);
+    output->write(&codec, 1);
+
+    IOBufOutputStream out(
+        *(streamArena_->pool()), nullptr, streamArena_->size());
+    writeInt32(&out, streams_.size());
+    if (rle) {
+      // Write RLE encoding marker.
+      writeInt32(&out, kRLE.size());
+      out.write(kRLE.data(), kRLE.size());
+      // Write number of RLE values.
+      writeInt32(&out, numRows);
+    }
+
+    for (auto& stream : streams_) {
+      stream->flush(&out);
+    }
+
+    const int32_t uncompressedSize = out.tellp();
+    VELOX_CHECK_LE(
+        uncompressedSize,
+        codec_->maxUncompressedLength(),
+        "UncompressedSize exceeds limit");
+    auto compressed = codec_->compress(out.getIOBuf().get());
+    const int32_t compressedSize = compressed->length();
+    writeInt32(output, uncompressedSize);
+    writeInt32(output, compressedSize);
+    const int32_t crcOffset = output->tellp();
+    writeInt64(output, 0); // Write zero checksum
+    // Number of columns and stream content. Unpause CRC.
+    if (listener) {
+      listener->resume();
+    }
+    output->write(
+        reinterpret_cast<char*>(compressed->writableData()),
+        compressed->length());
+    // Pause CRC computation
+    if (listener) {
+      listener->pause();
+    }
+
+    const int32_t endSize = output->tellp();
+    // Fill in crc
+    int64_t crc = 0;
+    if (listener) {
+      crc = computeChecksum(listener, codec, numRows, compressedSize);
+    }
+    output->seekp(crcOffset);
+    writeInt64(output, crc);
+    output->seekp(endSize);
+  }
+
+  // Writes the contents to 'stream' in wire format
+  void flushInternal(int32_t numRows, bool rle, OutputStream* out) {
+    auto listener = dynamic_cast<PrestoOutputStreamListener*>(out->listener());
+    // Reset CRC computation
+    if (listener) {
+      listener->reset();
+    }
+
+    if (!needCompression(*codec_)) {
+      flushUncompressed(numRows, rle, out, listener);
+    } else {
+      flushCompressed(numRows, rle, out, listener);
+    }
+  }
+
   static const int32_t kSizeInBytesOffset{4 + 1};
   static const int32_t kHeaderSize{kSizeInBytesOffset + 4 + 4 + 8};
 
+  StreamArena* const streamArena_;
+  const std::unique_ptr<folly::io::Codec> codec_;
+
   int32_t numRows_{0};
   std::vector<std::unique_ptr<VectorStream>> streams_;
 };
@@ -1683,11 +1781,13 @@ std::unique_ptr<VectorSerializer> PrestoVectorSerde::createSerializer(
     int32_t numRows,
     StreamArena* streamArena,
     const Options* options) {
-  bool useLosslessTimestamp = options != nullptr
-      ? static_cast<const PrestoOptions*>(options)->useLosslessTimestamp
-      : false;
+  auto prestoOptions = toPrestoOptions(options);
   return std::make_unique<PrestoVectorSerializer>(
-      type, numRows, streamArena, useLosslessTimestamp);
+      type,
+      numRows,
+      streamArena,
+      prestoOptions.useLosslessTimestamp,
+      prestoOptions.compressionKind);
 }
 
 void PrestoVectorSerde::serializeConstants(
@@ -1707,9 +1807,9 @@ void PrestoVectorSerde::deserialize(
     std::shared_ptr<const RowType> type,
     std::shared_ptr<RowVector>* result,
     const Options* options) {
-  bool useLosslessTimestamp = options != nullptr
-      ? static_cast<const PrestoOptions*>(options)->useLosslessTimestamp
-      : false;
+  auto prestoOptions = toPrestoOptions(options);
+  const bool useLosslessTimestamp = prestoOptions.useLosslessTimestamp;
+  auto codec = common::compressionKindToCodec(prestoOptions.compressionKind);
   auto numRows = source->read<int32_t>();
   if (!(*result) || !result->unique() || (*result)->type() != type) {
     *result = std::dynamic_pointer_cast<RowVector>(
@@ -1720,25 +1820,44 @@ void PrestoVectorSerde::deserialize(
 
   auto pageCodecMarker = source->read<int8_t>();
   auto uncompressedSize = source->read<int32_t>();
-  // skip size in bytes
-  source->skip(4);
+  auto compressedSize = source->read<int32_t>();
   auto checksum = source->read<int64_t>();
 
   int64_t actualCheckSum = 0;
   if (isChecksumBitSet(pageCodecMarker)) {
     actualCheckSum =
-        computeChecksum(source, pageCodecMarker, numRows, uncompressedSize);
+        computeChecksum(source, pageCodecMarker, numRows, compressedSize);
   }
 
   VELOX_CHECK_EQ(
       checksum, actualCheckSum, "Received corrupted serialized page.");
 
-  // skip number of columns
-  source->skip(4);
+  VELOX_CHECK_EQ(
+      needCompression(*codec),
+      isCompressedBitSet(pageCodecMarker),
+      "Compression kind {} should align with codec marker.",
+      common::compressionKindToString(
+          common::codecTypeToCompressionKind(codec->type())));
 
   auto children = &(*result)->children();
   auto childTypes = type->as<TypeKind::ROW>().children();
-  readColumns(source, pool, childTypes, children, useLosslessTimestamp);
+  if (!needCompression(*codec)) {
+    auto numColumns = source->read<int32_t>();
+    readColumns(source, pool, childTypes, children, useLosslessTimestamp);
+  } else {
+    auto compressBuf = folly::IOBuf::create(compressedSize);
+    source->readBytes(compressBuf->writableData(), compressedSize);
+    compressBuf->append(compressedSize);
+    auto uncompress = codec->uncompress(compressBuf.get(), uncompressedSize);
+    ByteRange byteRange{
+        uncompress->writableData(), (int32_t)uncompress->length(), 0};
+    ByteStream uncompressedSource;
+    uncompressedSource.resetInput({byteRange});
+    auto numColumns = uncompressedSource.read<int32_t>();
+    VELOX_CHECK_EQ(numColumns, type->as<TypeKind::ROW>().size());
+    readColumns(
+        &uncompressedSource, pool, childTypes, children, useLosslessTimestamp);
+  }
 }
 
 // static
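The serializer leans on folly codecs for the actual byte work; the essential compress/uncompress pairing used by flushCompressed() and deserialize() above, reduced to a sketch (buffer names are illustrative):

// Sketch only: the codec round trip the Presto page relies on.
auto codec = facebook::velox::common::compressionKindToCodec(
    facebook::velox::common::CompressionKind::CompressionKind_LZ4);

// Serialization: compress the flushed column streams in one shot.
std::unique_ptr<folly::IOBuf> compressed = codec->compress(streamsIOBuf.get());

// Deserialization: the uncompressedSize recorded in the page header is
// handed back to folly, which uses it to size and validate the output.
std::unique_ptr<folly::IOBuf> restored =
    codec->uncompress(compressed.get(), uncompressedSize);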
diff --git a/velox/serializers/PrestoSerializer.h b/velox/serializers/PrestoSerializer.h
index b9202ae51136..c0f6099c2a25 100644
--- a/velox/serializers/PrestoSerializer.h
+++ b/velox/serializers/PrestoSerializer.h
@@ -15,6 +15,7 @@
  */
 #pragma once
 #include "velox/common/base/Crc.h"
+#include "velox/common/compression/Compression.h"
 #include "velox/vector/VectorStream.h"
 
 namespace facebook::velox::serializer::presto {
@@ -22,13 +23,21 @@ class PrestoVectorSerde : public VectorSerde {
  public:
   // Input options that the serializer recognizes.
   struct PrestoOptions : VectorSerde::Options {
-    explicit PrestoOptions(bool useLosslessTimestamp)
-        : useLosslessTimestamp(useLosslessTimestamp) {}
+    PrestoOptions() = default;
+
+    PrestoOptions(
+        bool _useLosslessTimestamp,
+        common::CompressionKind _compressionKind)
+        : useLosslessTimestamp(_useLosslessTimestamp),
+          compressionKind(_compressionKind) {}
+
     // Currently presto only supports millisecond precision and the serializer
     // converts velox native timestamp to that resulting in loss of precision.
     // This option allows it to serialize with nanosecond precision and is
     // currently used for spilling. Is false by default.
     bool useLosslessTimestamp{false};
+
+    common::CompressionKind compressionKind{
+        common::CompressionKind::CompressionKind_NONE};
   };
 
   void estimateSerializedSize(
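From the caller's perspective, compression is opted into per serialize/deserialize call through these options; a sketch under the assumption that a serde, an arena, and the row type already exist:

// Sketch only: enable ZSTD compression for a Presto-page exchange.
using facebook::velox::common::CompressionKind;
using facebook::velox::serializer::presto::PrestoVectorSerde;

PrestoVectorSerde::PrestoOptions options(
    /*useLosslessTimestamp=*/false, CompressionKind::CompressionKind_ZSTD);

auto serializer = serde->createSerializer(rowType, numRows, &arena, &options);
// The reader must pass a matching compressionKind: deserialize() validates
// the page's codec marker against the codec derived from its options.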
diff --git a/velox/serializers/tests/CMakeLists.txt b/velox/serializers/tests/CMakeLists.txt
index 31e70511f74f..08cddb61f961 100644
--- a/velox/serializers/tests/CMakeLists.txt
+++ b/velox/serializers/tests/CMakeLists.txt
@@ -14,7 +14,7 @@
 add_executable(
   velox_presto_serializer_test
   PrestoOutputStreamListenerTest.cpp
   PrestoSerializerTest.cpp
-  UnsafeRowSerializerTest.cpp)
+  UnsafeRowSerializerTest.cpp CompactRowSerializerTest.cpp)
 
 add_test(velox_presto_serializer_test velox_presto_serializer_test)
diff --git a/velox/serializers/tests/CompactRowSerializerTest.cpp b/velox/serializers/tests/CompactRowSerializerTest.cpp
new file mode 100644
index 000000000000..31f21a95a7de
--- /dev/null
+++ b/velox/serializers/tests/CompactRowSerializerTest.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/serializers/CompactRowSerializer.h"
+#include <gtest/gtest.h>
+#include "velox/vector/fuzzer/VectorFuzzer.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+namespace facebook::velox::serializer {
+namespace {
+
+class CompactRowSerializerTest : public ::testing::Test,
+                                 public test::VectorTestBase {
+ protected:
+  void SetUp() override {
+    pool_ = memory::addDefaultLeafMemoryPool();
+    serde_ = std::make_unique<CompactRowVectorSerde>();
+  }
+
+  void serialize(RowVectorPtr rowVector, std::ostream* output) {
+    auto numRows = rowVector->size();
+
+    std::vector<IndexRange> rows(numRows);
+    for (int i = 0; i < numRows; i++) {
+      rows[i] = IndexRange{i, 1};
+    }
+
+    auto arena = std::make_unique<StreamArena>(pool_.get());
+    auto rowType = asRowType(rowVector->type());
+    auto serializer = serde_->createSerializer(rowType, numRows, arena.get());
+
+    serializer->append(rowVector, folly::Range(rows.data(), numRows));
+    OStreamOutputStream out(output);
+    serializer->flush(&out);
+  }
+
+  std::unique_ptr<ByteStream> toByteStream(const std::string_view& input) {
+    auto byteStream = std::make_unique<ByteStream>();
+    ByteRange byteRange{
+        reinterpret_cast<uint8_t*>(const_cast<char*>(input.data())),
+        (int32_t)input.length(),
+        0};
+    byteStream->resetInput({byteRange});
+    return byteStream;
+  }
+
+  RowVectorPtr deserialize(
+      const RowTypePtr& rowType,
+      const std::string_view& input) {
+    auto byteStream = toByteStream(input);
+
+    RowVectorPtr result;
+    serde_->deserialize(byteStream.get(), pool_.get(), rowType, &result);
+    return result;
+  }
+
+  void testRoundTrip(RowVectorPtr rowVector) {
+    std::ostringstream out;
+    serialize(rowVector, &out);
+
+    auto rowType = asRowType(rowVector->type());
+    auto deserialized = deserialize(rowType, out.str());
+    test::assertEqualVectors(deserialized, rowVector);
+  }
+
+  std::shared_ptr<memory::MemoryPool> pool_;
+  std::unique_ptr<VectorSerde> serde_;
+};
+
+TEST_F(CompactRowSerializerTest, fuzz) {
+  auto rowType = ROW({
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      INTEGER(),
+      BIGINT(),
+      REAL(),
+      DOUBLE(),
+      VARCHAR(),
+      TIMESTAMP(),
+      ROW({VARCHAR(), INTEGER()}),
+      ARRAY(INTEGER()),
+      ARRAY(INTEGER()),
+      MAP(VARCHAR(), INTEGER()),
+      MAP(VARCHAR(), ARRAY(INTEGER())),
+  });
+
+  VectorFuzzer::Options opts;
+  opts.vectorSize = 5;
+  opts.nullRatio = 0.1;
+  opts.containerHasNulls = false;
+  opts.dictionaryHasNulls = false;
+  opts.stringVariableLength = true;
+  opts.stringLength = 20;
+  opts.containerVariableLength = false;
+
+  // Spark uses microseconds to store timestamps.
+  opts.timestampPrecision =
+      VectorFuzzer::Options::TimestampPrecision::kMicroSeconds;
+  opts.containerLength = 10;
+
+  auto seed = folly::Random::rand32();
+
+  LOG(ERROR) << "Seed: " << seed;
+  SCOPED_TRACE(fmt::format("seed: {}", seed));
+  VectorFuzzer fuzzer(opts, pool_.get(), seed);
+
+  auto data = fuzzer.fuzzRow(rowType);
+  testRoundTrip(data);
+}
+
+} // namespace
+} // namespace facebook::velox::serializer
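Because this test draws a fresh seed on every run, reproducing a failure means pinning the seed printed in the log; a sketch of that change, with the constant standing in for the logged value:

// Sketch only: rerun the fuzz body deterministically.
const uint32_t kFailingSeed = 0; // Replace with the seed from the failing log.
VectorFuzzer fuzzer(opts, pool_.get(), kFailingSeed);
auto data = fuzzer.fuzzRow(rowType);
testRoundTrip(data);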
diff --git a/velox/serializers/tests/PrestoSerializerTest.cpp b/velox/serializers/tests/PrestoSerializerTest.cpp
index eedeb372b20b..450607ad85d6 100644
--- a/velox/serializers/tests/PrestoSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoSerializerTest.cpp
@@ -27,8 +27,13 @@
 using namespace facebook::velox;
 using namespace facebook::velox::test;
 
-class PrestoSerializerTest : public ::testing::Test {
+class PrestoSerializerTest
+    : public ::testing::TestWithParam<common::CompressionKind> {
  protected:
+  static void SetUpTestCase() {
+    serializer::presto::PrestoVectorSerde::registerVectorSerde();
+  }
+
   void SetUp() override {
     pool_ = memory::addDefaultLeafMemoryPool();
     serde_ = std::make_unique<serializer::presto::PrestoVectorSerde>();
@@ -52,17 +57,30 @@ class PrestoSerializerTest : public ::testing::Test {
         rowVector, folly::Range(rows.data(), numRows), rawRowSizes.data());
   }
 
+  serializer::presto::PrestoVectorSerde::PrestoOptions getParamSerdeOptions(
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
+    const bool useLosslessTimestamp =
+        serdeOptions == nullptr ? false : serdeOptions->useLosslessTimestamp;
+    common::CompressionKind kind = GetParam();
+    serializer::presto::PrestoVectorSerde::PrestoOptions paramOptions{
+        useLosslessTimestamp, kind};
+    return paramOptions;
+  }
+
   void serialize(
       const RowVectorPtr& rowVector,
       std::ostream* output,
-      const VectorSerde::Options* serdeOptions) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
     sanityCheckEstimateSerializedSize(rowVector);
 
     auto arena = std::make_unique<StreamArena>(pool_.get());
     auto rowType = asRowType(rowVector->type());
     auto numRows = rowVector->size();
+    auto paramOptions = getParamSerdeOptions(serdeOptions);
     auto serializer =
-        serde_->createSerializer(rowType, numRows, arena.get(), serdeOptions);
+        serde_->createSerializer(rowType, numRows, arena.get(), &paramOptions);
 
     serializer->append(rowVector);
     facebook::velox::serializer::presto::PrestoOutputStreamListener listener;
@@ -73,11 +91,13 @@ class PrestoSerializerTest : public ::testing::Test {
   void serializeRle(
       const RowVectorPtr& rowVector,
       std::ostream* output,
-      const VectorSerde::Options* serdeOptions) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
     facebook::velox::serializer::presto::PrestoOutputStreamListener listener;
     OStreamOutputStream out(output, &listener);
     auto arena = std::make_unique<StreamArena>(pool_.get());
-    serde_->serializeConstants(rowVector, arena.get(), serdeOptions, &out);
+    auto paramOptions = getParamSerdeOptions(serdeOptions);
+    serde_->serializeConstants(rowVector, arena.get(), &paramOptions, &out);
   }
 
   std::unique_ptr<ByteStream> toByteStream(const std::string& input) {
@@ -93,12 +113,13 @@ class PrestoSerializerTest : public ::testing::Test {
   RowVectorPtr deserialize(
       const RowTypePtr& rowType,
       const std::string& input,
-      const VectorSerde::Options* serdeOptions) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions*
+          serdeOptions) {
     auto byteStream = toByteStream(input);
-
+    auto paramOptions = getParamSerdeOptions(serdeOptions);
     RowVectorPtr result;
     serde_->deserialize(
-        byteStream.get(), pool_.get(), rowType, &result, serdeOptions);
+        byteStream.get(), pool_.get(), rowType, &result, &paramOptions);
     return result;
   }
 
@@ -115,7 +136,8 @@ class PrestoSerializerTest : public ::testing::Test {
 
   void testRoundTrip(
       VectorPtr vector,
-      const VectorSerde::Options* serdeOptions = nullptr) {
+      const serializer::presto::PrestoVectorSerde::PrestoOptions* serdeOptions =
+          nullptr) {
     auto rowVector = vectorMaker_->rowVector({vector});
     std::ostringstream out;
     serialize(rowVector, &out, serdeOptions);
@@ -125,13 +147,16 @@ class PrestoSerializerTest : public ::testing::Test {
     assertEqualVectors(deserialized, rowVector);
   }
 
-  void testRleRoundTrip(const VectorPtr& constantVector) {
+  void testRleRoundTrip(
+      const VectorPtr& constantVector,
+      const serializer::presto::PrestoVectorSerde::PrestoOptions* serdeOptions =
+          nullptr) {
     auto rowVector = vectorMaker_->rowVector({constantVector});
     std::ostringstream out;
-    serializeRle(rowVector, &out, nullptr);
+    serializeRle(rowVector, &out, serdeOptions);
 
     auto rowType = asRowType(rowVector->type());
-    auto deserialized = deserialize(rowType, out.str(), nullptr);
+    auto deserialized = deserialize(rowType, out.str(), serdeOptions);
 
     assertEqualVectors(rowVector, deserialized);
   }
 
@@ -141,7 +166,7 @@ class PrestoSerializerTest : public ::testing::Test {
   std::unique_ptr<test::VectorMaker> vectorMaker_;
 };
 
-TEST_F(PrestoSerializerTest, basic) {
+TEST_P(PrestoSerializerTest, basic) {
   vector_size_t numRows = 1'000;
   auto rowVector = makeTestVector(numRows);
   testRoundTrip(rowVector);
@@ -149,7 +174,7 @@ TEST_F(PrestoSerializerTest, basic) {
 
 /// Test serialization of a dictionary vector that adds nulls to the base
 /// vector.
-TEST_F(PrestoSerializerTest, dictionaryWithExtraNulls) {
+TEST_P(PrestoSerializerTest, dictionaryWithExtraNulls) {
   vector_size_t size = 1'000;
 
   auto base =
@@ -173,7 +198,7 @@ TEST_F(PrestoSerializerTest, dictionaryWithExtraNulls) {
   testRoundTrip(dictionary);
 }
 
-TEST_F(PrestoSerializerTest, emptyPage) {
+TEST_P(PrestoSerializerTest, emptyPage) {
   auto rowVector = vectorMaker_->rowVector(ROW({"a"}, {BIGINT()}), 0);
 
   std::ostringstream out;
@@ -184,7 +209,7 @@ TEST_F(PrestoSerializerTest, emptyPage) {
   assertEqualVectors(deserialized, rowVector);
 }
 
-TEST_F(PrestoSerializerTest, emptyArray) {
+TEST_P(PrestoSerializerTest, emptyArray) {
   auto arrayVector = vectorMaker_->arrayVector<int32_t>(
       1'000,
       [](vector_size_t row) { return row % 5; },
@@ -193,7 +218,7 @@ TEST_F(PrestoSerializerTest, emptyArray) {
   testRoundTrip(arrayVector);
 }
 
-TEST_F(PrestoSerializerTest, emptyMap) {
+TEST_P(PrestoSerializerTest, emptyMap) {
   auto mapVector = vectorMaker_->mapVector<int32_t, int32_t>(
       1'000,
       [](vector_size_t row) { return row % 5; },
@@ -203,7 +228,7 @@ TEST_F(PrestoSerializerTest, emptyMap) {
   testRoundTrip(mapVector);
 }
 
-TEST_F(PrestoSerializerTest, timestampWithTimeZone) {
+TEST_P(PrestoSerializerTest, timestampWithTimeZone) {
   auto timestamp = vectorMaker_->flatVector<int64_t>(
       100, [](auto row) { return 10'000 + row; });
   auto timezone =
@@ -225,7 +250,7 @@ TEST_F(PrestoSerializerTest, timestampWithTimeZone) {
   testRoundTrip(vector);
 }
 
-TEST_F(PrestoSerializerTest, intervalDayTime) {
+TEST_P(PrestoSerializerTest, intervalDayTime) {
  auto vector = vectorMaker_->flatVector<int64_t>(
       100,
       [](auto row) { return row + folly::Random::rand32(); },
@@ -241,7 +266,7 @@ TEST_F(PrestoSerializerTest, intervalDayTime) {
   testRoundTrip(vector);
 }
 
-TEST_F(PrestoSerializerTest, unknown) {
+TEST_P(PrestoSerializerTest, unknown) {
   const vector_size_t size = 123;
   auto constantVector =
       BaseVector::createNullConstant(UNKNOWN(), 123, pool_.get());
@@ -254,7 +279,7 @@ TEST_F(PrestoSerializerTest, unknown) {
   testRoundTrip(flatVector);
 }
 
-TEST_F(PrestoSerializerTest, multiPage) {
+TEST_P(PrestoSerializerTest, multiPage) {
   std::ostringstream out;
 
   // page 1
@@ -275,27 +300,29 @@ TEST_F(PrestoSerializerTest, multiPage) {
   auto byteStream = toByteStream(bytes);
 
   RowVectorPtr deserialized;
+  auto paramOptions = getParamSerdeOptions(nullptr);
   serde_->deserialize(
-      byteStream.get(), pool_.get(), rowType, &deserialized, nullptr);
+      byteStream.get(), pool_.get(), rowType, &deserialized, &paramOptions);
   ASSERT_FALSE(byteStream->atEnd());
   assertEqualVectors(deserialized, a);
 
   serde_->deserialize(
-      byteStream.get(), pool_.get(), rowType, &deserialized, nullptr);
+      byteStream.get(), pool_.get(), rowType, &deserialized, &paramOptions);
   assertEqualVectors(deserialized, b);
   ASSERT_FALSE(byteStream->atEnd());
 
   serde_->deserialize(
-      byteStream.get(), pool_.get(), rowType, &deserialized, nullptr);
+      byteStream.get(), pool_.get(), rowType, &deserialized, &paramOptions);
   assertEqualVectors(deserialized, c);
   ASSERT_TRUE(byteStream->atEnd());
 }
 
-TEST_F(PrestoSerializerTest, timestampWithNanosecondPrecision) {
+TEST_P(PrestoSerializerTest, timestampWithNanosecondPrecision) {
   // Verify that nanosecond precision is preserved when the right options are
   // passed to the serde.
   const serializer::presto::PrestoVectorSerde::PrestoOptions
-      kUseLosslessTimestampOptions(true);
+      kUseLosslessTimestampOptions(
+          true, common::CompressionKind::CompressionKind_NONE);
   auto timestamp = vectorMaker_->flatVector<Timestamp>(
       {Timestamp{0, 0},
        Timestamp{12, 0},
@@ -321,7 +348,7 @@ TEST_F(PrestoSerializerTest, timestampWithNanosecondPrecision) {
   assertEqualVectors(deserialized, expectedOutputWithLostPrecision);
 }
 
-TEST_F(PrestoSerializerTest, longDecimal) {
+TEST_P(PrestoSerializerTest, longDecimal) {
   std::vector<int128_t> decimalValues(102);
   decimalValues[0] = DecimalUtil::kLongDecimalMin;
   for (int row = 1; row < 101; row++) {
@@ -340,7 +367,7 @@ TEST_F(PrestoSerializerTest, longDecimal) {
   testRoundTrip(vector);
 }
 
-TEST_F(PrestoSerializerTest, rle) {
+TEST_P(PrestoSerializerTest, rle) {
   // Test RLE vectors with non-null value.
   testRleRoundTrip(
       BaseVector::createConstant(BOOLEAN(), true, 12, pool_.get()));
@@ -369,7 +396,7 @@ TEST_F(PrestoSerializerTest, rle) {
       MAP(VARCHAR(), INTEGER()), 17, pool_.get()));
 }
 
-TEST_F(PrestoSerializerTest, lazy) {
+TEST_P(PrestoSerializerTest, lazy) {
   constexpr int kSize = 1000;
   auto rowVector = makeTestVector(kSize);
   auto lazyVector = std::make_shared<LazyVector>(
@@ -380,9 +407,7 @@ TEST_F(PrestoSerializerTest, lazy) {
   testRoundTrip(lazyVector);
 }
 
-TEST_F(PrestoSerializerTest, ioBufRoundTrip) {
-  serializer::presto::PrestoVectorSerde::registerVectorSerde();
-
+TEST_P(PrestoSerializerTest, ioBufRoundTrip) {
   VectorFuzzer::Options opts;
   opts.timestampPrecision =
       VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
@@ -400,3 +425,13 @@ TEST_F(PrestoSerializerTest, ioBufRoundTrip) {
   assertEqualVectors(inputRowVector, outputRowVector);
 }
+
+INSTANTIATE_TEST_SUITE_P(
+    PrestoSerializerTest,
+    PrestoSerializerTest,
+    ::testing::Values(
+        common::CompressionKind::CompressionKind_NONE,
+        common::CompressionKind::CompressionKind_ZLIB,
+        common::CompressionKind::CompressionKind_SNAPPY,
+        common::CompressionKind::CompressionKind_ZSTD,
+        common::CompressionKind::CompressionKind_LZ4));
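With this instantiation, every TEST_P body above runs once per listed CompressionKind, with GetParam() feeding getParamSerdeOptions(). Additional codecs can be covered by a separate instantiation; a hypothetical example (GZIP is a CompressionKind in Velox, but this patch does not exercise it):

// Hypothetical: a separate instantiation exercising GZIP only.
INSTANTIATE_TEST_SUITE_P(
    PrestoSerializerGzipTest,
    PrestoSerializerTest,
    ::testing::Values(common::CompressionKind::CompressionKind_GZIP));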